1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
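// e.g. <3 x s32> becomes <4 x s32>.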
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the number of bits up to the next power of two.
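// e.g. s23 becomes s32.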
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
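// Match small (sub-dword element) vectors with an odd element count whose
// total size is not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.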
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
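// Match vectors of 16-bit elements with more than two elements,
// e.g. <4 x s16> or <6 x s16>.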
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
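// Pad the vector with one extra element, e.g. <3 x s16> -> <4 x s16>.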
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
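// Break a wide vector into roughly 64-bit pieces,
// e.g. <4 x s32> (128 bits) -> <2 x s32>.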
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements to reach the next multiple of 32
// bits.
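// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits).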
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
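// Coerce a type to one the register file can hold directly: s16 for sub-dword
// sizes, otherwise a scalar or vector of 32-bit pieces,
// e.g. <4 x s8> -> s32 and <8 x s16> -> <4 x s32>.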
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size < 32) {
125       // <2 x s8> -> s16
126       assert(Size == 16);
127       CoercedTy = LLT::scalar(16);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
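// Match vectors whose element type can be operated on directly: s16 or any
// element of at least 32 bits.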
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
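// Match a truncating store of a scalar wider than 32 bits,
// e.g. storing only the low 16 bits of an s64 value.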
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
478 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16())
616     FMad.customFor({S32, S16});
617   else
618     FMad.customFor({S32});
619   FMad.scalarize(0)
620       .lower();
621 
622   // TODO: Do we need to clamp maximum bitwidth?
623   getActionDefinitionsBuilder(G_TRUNC)
624     .legalIf(isScalar(0))
625     .legalFor({{V2S16, V2S32}})
626     .clampMaxNumElements(0, S16, 2)
627     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
628     // situations (like an invalid implicit use), we don't want to infinite loop
629     // in the legalizer.
630     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
631     .alwaysLegal();
632 
633   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
634     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
635                {S32, S1}, {S64, S1}, {S16, S1}})
636     .scalarize(0)
637     .clampScalar(0, S32, S64)
638     .widenScalarToNextPow2(1, 32);
639 
640   // TODO: Split s1->s64 during regbankselect for VALU.
641   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
642     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
643     .lowerFor({{S32, S64}})
644     .lowerIf(typeIs(1, S1))
645     .customFor({{S64, S64}});
646   if (ST.has16BitInsts())
647     IToFP.legalFor({{S16, S16}});
648   IToFP.clampScalar(1, S32, S64)
649        .scalarize(0)
650        .widenScalarToNextPow2(1);
651 
652   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
653     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
654     .customFor({{S64, S64}});
655   if (ST.has16BitInsts())
656     FPToI.legalFor({{S16, S16}});
657   else
658     FPToI.minScalar(1, S32);
659 
660   FPToI.minScalar(0, S32)
661        .scalarize(0)
662        .lower();
663 
664   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
665     .scalarize(0)
666     .lower();
667 
668   if (ST.has16BitInsts()) {
669     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
670       .legalFor({S16, S32, S64})
671       .clampScalar(0, S16, S64)
672       .scalarize(0);
673   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
674     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
675       .legalFor({S32, S64})
676       .clampScalar(0, S32, S64)
677       .scalarize(0);
678   } else {
679     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
680       .legalFor({S32})
681       .customFor({S64})
682       .clampScalar(0, S32, S64)
683       .scalarize(0);
684   }
685 
686   // FIXME: Clamp offset operand.
687   getActionDefinitionsBuilder(G_PTR_ADD)
688     .legalIf(isPointer(0))
689     .scalarize(0);
690 
691   getActionDefinitionsBuilder(G_PTRMASK)
692     .legalIf(typeInSet(1, {S64, S32}))
693     .minScalar(1, S32)
694     .maxScalarIf(sizeIs(0, 32), 1, S32)
695     .maxScalarIf(sizeIs(0, 64), 1, S64)
696     .scalarize(0);
697 
698   auto &CmpBuilder =
699     getActionDefinitionsBuilder(G_ICMP)
700     // The compare output type differs based on the register bank of the output,
701     // so make both s1 and s32 legal.
702     //
703     // Scalar compares producing output in scc will be promoted to s32, as that
704     // is the allocatable register type that will be needed for the copy from
705     // scc. This will be promoted during RegBankSelect, and we assume something
706     // before that won't try to use s32 result types.
707     //
708     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
709     // bank.
710     .legalForCartesianProduct(
711       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
712     .legalForCartesianProduct(
713       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
714   if (ST.has16BitInsts()) {
715     CmpBuilder.legalFor({{S1, S16}});
716   }
717 
718   CmpBuilder
719     .widenScalarToNextPow2(1)
720     .clampScalar(1, S32, S64)
721     .scalarize(0)
722     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
723 
724   getActionDefinitionsBuilder(G_FCMP)
725     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
726     .widenScalarToNextPow2(1)
727     .clampScalar(1, S32, S64)
728     .scalarize(0);
729 
730   // FIXME: fpow has a selection pattern that should move to custom lowering.
731   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
732   if (ST.has16BitInsts())
733     Exp2Ops.legalFor({S32, S16});
734   else
735     Exp2Ops.legalFor({S32});
736   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
737   Exp2Ops.scalarize(0);
738 
739   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
740   if (ST.has16BitInsts())
741     ExpOps.customFor({{S32}, {S16}});
742   else
743     ExpOps.customFor({S32});
744   ExpOps.clampScalar(0, MinScalarFPTy, S32)
745         .scalarize(0);
746 
747   // The 64-bit versions produce 32-bit results, but only on the SALU.
748   getActionDefinitionsBuilder(G_CTPOP)
749     .legalFor({{S32, S32}, {S32, S64}})
750     .clampScalar(0, S32, S32)
751     .clampScalar(1, S32, S64)
752     .scalarize(0)
753     .widenScalarToNextPow2(0, 32)
754     .widenScalarToNextPow2(1, 32);
755 
756   // The hardware instructions return a different result on 0 than the generic
757   // instructions expect. The hardware produces -1, but these produce the
758   // bitwidth.
759   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
760     .scalarize(0)
761     .clampScalar(0, S32, S32)
762     .clampScalar(1, S32, S64)
763     .widenScalarToNextPow2(0, 32)
764     .widenScalarToNextPow2(1, 32)
765     .lower();
766 
767   // The 64-bit versions produce 32-bit results, but only on the SALU.
768   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
769     .legalFor({{S32, S32}, {S32, S64}})
770     .clampScalar(0, S32, S32)
771     .clampScalar(1, S32, S64)
772     .scalarize(0)
773     .widenScalarToNextPow2(0, 32)
774     .widenScalarToNextPow2(1, 32);
775 
776   getActionDefinitionsBuilder(G_BITREVERSE)
777     .legalFor({S32})
778     .clampScalar(0, S32, S32)
779     .scalarize(0);
780 
781   if (ST.has16BitInsts()) {
782     getActionDefinitionsBuilder(G_BSWAP)
783       .legalFor({S16, S32, V2S16})
784       .clampMaxNumElements(0, S16, 2)
785       // FIXME: Fixing non-power-of-2 before clamp is workaround for
786       // narrowScalar limitation.
787       .widenScalarToNextPow2(0)
788       .clampScalar(0, S16, S32)
789       .scalarize(0);
790 
791     if (ST.hasVOP3PInsts()) {
792       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
793         .legalFor({S32, S16, V2S16})
794         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
795         .clampMaxNumElements(0, S16, 2)
796         .minScalar(0, S16)
797         .widenScalarToNextPow2(0)
798         .scalarize(0)
799         .lower();
800     } else {
801       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
802         .legalFor({S32, S16})
803         .widenScalarToNextPow2(0)
804         .minScalar(0, S16)
805         .scalarize(0)
806         .lower();
807     }
808   } else {
809     // TODO: Should have same legality without v_perm_b32
810     getActionDefinitionsBuilder(G_BSWAP)
811       .legalFor({S32})
812       .lowerIf(scalarNarrowerThan(0, 32))
813       // FIXME: Fixing non-power-of-2 before clamp is workaround for
814       // narrowScalar limitation.
815       .widenScalarToNextPow2(0)
816       .maxScalar(0, S32)
817       .scalarize(0)
818       .lower();
819 
820     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
821       .legalFor({S32})
822       .minScalar(0, S32)
823       .widenScalarToNextPow2(0)
824       .scalarize(0)
825       .lower();
826   }
827 
828   getActionDefinitionsBuilder(G_INTTOPTR)
829     // List the common cases
830     .legalForCartesianProduct(AddrSpaces64, {S64})
831     .legalForCartesianProduct(AddrSpaces32, {S32})
832     .scalarize(0)
833     // Accept any address space as long as the size matches
834     .legalIf(sameSize(0, 1))
835     .widenScalarIf(smallerThan(1, 0),
836       [](const LegalityQuery &Query) {
837         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
838       })
839     .narrowScalarIf(largerThan(1, 0),
840       [](const LegalityQuery &Query) {
841         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
842       });
843 
844   getActionDefinitionsBuilder(G_PTRTOINT)
845     // List the common cases
846     .legalForCartesianProduct(AddrSpaces64, {S64})
847     .legalForCartesianProduct(AddrSpaces32, {S32})
848     .scalarize(0)
849     // Accept any address space as long as the size matches
850     .legalIf(sameSize(0, 1))
851     .widenScalarIf(smallerThan(0, 1),
852       [](const LegalityQuery &Query) {
853         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
854       })
855     .narrowScalarIf(
856       largerThan(0, 1),
857       [](const LegalityQuery &Query) {
858         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
859       });
860 
861   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
862     .scalarize(0)
863     .custom();
864 
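  // Decide whether a load/store must be split: the access is larger than the
  // address space allows, has an awkward (non-power-of-2, non-96-bit) size,
  // or is under-aligned beyond what the target tolerates.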
865   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
866                                     bool IsLoad) -> bool {
867     const LLT DstTy = Query.Types[0];
868 
869     // Split vector extloads.
870     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
871     unsigned Align = Query.MMODescrs[0].AlignInBits;
872 
873     if (MemSize < DstTy.getSizeInBits())
874       MemSize = std::max(MemSize, Align);
875 
876     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
877       return true;
878 
879     const LLT PtrTy = Query.Types[1];
880     unsigned AS = PtrTy.getAddressSpace();
881     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
882       return true;
883 
884     // Catch weird sized loads that don't evenly divide into the access sizes
885     // TODO: May be able to widen depending on alignment etc.
886     unsigned NumRegs = (MemSize + 31) / 32;
887     if (NumRegs == 3) {
888       if (!ST.hasDwordx3LoadStores())
889         return true;
890     } else {
891       // If the alignment allows, these should have been widened.
892       if (!isPowerOf2_32(NumRegs))
893         return true;
894     }
895 
896     if (Align < MemSize) {
897       const SITargetLowering *TLI = ST.getTargetLowering();
898       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
899     }
900 
901     return false;
902   };
903 
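  // Widen an oddly sized load result to the next power of two when the known
  // alignment already covers the rounded-up size.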
904   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
905                                          unsigned Opc) -> bool {
906     unsigned Size = Query.Types[0].getSizeInBits();
907     if (isPowerOf2_32(Size))
908       return false;
909 
910     if (Size == 96 && ST.hasDwordx3LoadStores())
911       return false;
912 
913     unsigned AddrSpace = Query.Types[1].getAddressSpace();
914     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
915       return false;
916 
917     unsigned Align = Query.MMODescrs[0].AlignInBits;
918     unsigned RoundedSize = NextPowerOf2(Size);
919     return (Align >= RoundedSize);
920   };
921 
922   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
923   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
924   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
925 
926   // TODO: Refine based on subtargets which support unaligned access or 128-bit
927   // LDS
928   // TODO: Unsupported flat for SI.
929 
930   for (unsigned Op : {G_LOAD, G_STORE}) {
931     const bool IsStore = Op == G_STORE;
932 
933     auto &Actions = getActionDefinitionsBuilder(Op);
934     // Whitelist some common cases.
935     // TODO: Does this help compile time at all?
936     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
937                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
938                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
939                                       {S64, GlobalPtr, 64, GlobalAlign32},
940                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
941                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
942                                       {S32, GlobalPtr, 8, GlobalAlign8},
943                                       {S32, GlobalPtr, 16, GlobalAlign16},
944 
945                                       {S32, LocalPtr, 32, 32},
946                                       {S64, LocalPtr, 64, 32},
947                                       {V2S32, LocalPtr, 64, 32},
948                                       {S32, LocalPtr, 8, 8},
949                                       {S32, LocalPtr, 16, 16},
950                                       {V2S16, LocalPtr, 32, 32},
951 
952                                       {S32, PrivatePtr, 32, 32},
953                                       {S32, PrivatePtr, 8, 8},
954                                       {S32, PrivatePtr, 16, 16},
955                                       {V2S16, PrivatePtr, 32, 32},
956 
957                                       {S32, ConstantPtr, 32, GlobalAlign32},
958                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
959                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
960                                       {S64, ConstantPtr, 64, GlobalAlign32},
961                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
962     Actions.legalIf(
963       [=](const LegalityQuery &Query) -> bool {
964         return isLoadStoreLegal(ST, Query, Op);
965       });
966 
967     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
968     // 64-bits.
969     //
970     // TODO: Should generalize bitcast action into coerce, which will also cover
971     // inserting addrspacecasts.
972     Actions.customIf(typeIs(1, Constant32Ptr));
973 
974     // Turn any illegal element vectors into something easier to deal
975     // with. These will ultimately produce 32-bit scalar shifts to extract the
976     // parts anyway.
977     //
978     // For odd 16-bit element vectors, prefer to split those into pieces with
979     // 16-bit vector parts.
980     Actions.bitcastIf(
981       [=](const LegalityQuery &Query) -> bool {
982         const LLT Ty = Query.Types[0];
983 
984         // Do not cast an extload/truncstore.
985         if (Ty.getSizeInBits() != Query.MMODescrs[0].SizeInBits)
986           return false;
987 
988         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
989           return true;
990         const unsigned Size = Ty.getSizeInBits();
991         return Ty.isVector() && isRegisterSize(Size) &&
992                !isRegisterVectorElementType(Ty.getElementType());
993       }, bitcastToRegisterType(0));
994 
995     Actions
996         .customIf(typeIs(1, Constant32Ptr))
997         // Widen suitably aligned loads by loading extra elements.
998         .moreElementsIf([=](const LegalityQuery &Query) {
999             const LLT Ty = Query.Types[0];
1000             return Op == G_LOAD && Ty.isVector() &&
1001                    shouldWidenLoadResult(Query, Op);
1002           }, moreElementsToNextPow2(0))
1003         .widenScalarIf([=](const LegalityQuery &Query) {
1004             const LLT Ty = Query.Types[0];
1005             return Op == G_LOAD && !Ty.isVector() &&
1006                    shouldWidenLoadResult(Query, Op);
1007           }, widenScalarOrEltToNextPow2(0))
1008         .narrowScalarIf(
1009             [=](const LegalityQuery &Query) -> bool {
1010               return !Query.Types[0].isVector() &&
1011                      needToSplitMemOp(Query, Op == G_LOAD);
1012             },
1013             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1014               const LLT DstTy = Query.Types[0];
1015               const LLT PtrTy = Query.Types[1];
1016 
1017               const unsigned DstSize = DstTy.getSizeInBits();
1018               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1019 
1020               // Split extloads.
1021               if (DstSize > MemSize)
1022                 return std::make_pair(0, LLT::scalar(MemSize));
1023 
1024               if (!isPowerOf2_32(DstSize)) {
1025                 // We're probably decomposing an odd sized store. Try to split
1026                 // to the widest type. TODO: Account for alignment. As-is it
1027                 // should be OK, since the new parts will be further legalized.
1028                 unsigned FloorSize = PowerOf2Floor(DstSize);
1029                 return std::make_pair(0, LLT::scalar(FloorSize));
1030               }
1031 
1032               if (DstSize > 32 && (DstSize % 32 != 0)) {
1033                 // FIXME: Need a way to specify non-extload of larger size if
1034                 // suitably aligned.
1035                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1036               }
1037 
1038               unsigned MaxSize = maxSizeForAddrSpace(ST,
1039                                                      PtrTy.getAddressSpace(),
1040                                                      Op == G_LOAD);
1041               if (MemSize > MaxSize)
1042                 return std::make_pair(0, LLT::scalar(MaxSize));
1043 
1044               unsigned Align = Query.MMODescrs[0].AlignInBits;
1045               return std::make_pair(0, LLT::scalar(Align));
1046             })
1047         .fewerElementsIf(
1048             [=](const LegalityQuery &Query) -> bool {
1049               return Query.Types[0].isVector() &&
1050                      needToSplitMemOp(Query, Op == G_LOAD);
1051             },
1052             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1053               const LLT DstTy = Query.Types[0];
1054               const LLT PtrTy = Query.Types[1];
1055 
1056               LLT EltTy = DstTy.getElementType();
1057               unsigned MaxSize = maxSizeForAddrSpace(ST,
1058                                                      PtrTy.getAddressSpace(),
1059                                                      Op == G_LOAD);
1060 
1061               // FIXME: Handle widened to power of 2 results better. This ends
1062               // up scalarizing.
1063               // FIXME: 3 element stores scalarized on SI
1064 
1065               // Split if it's too large for the address space.
1066               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1067                 unsigned NumElts = DstTy.getNumElements();
1068                 unsigned EltSize = EltTy.getSizeInBits();
1069 
1070                 if (MaxSize % EltSize == 0) {
1071                   return std::make_pair(
1072                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1073                 }
1074 
1075                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1076 
1077                 // FIXME: Refine when odd breakdowns handled
1078                 // The scalars will need to be re-legalized.
1079                 if (NumPieces == 1 || NumPieces >= NumElts ||
1080                     NumElts % NumPieces != 0)
1081                   return std::make_pair(0, EltTy);
1082 
1083                 return std::make_pair(0,
1084                                       LLT::vector(NumElts / NumPieces, EltTy));
1085               }
1086 
1087               // FIXME: We could probably handle weird extending loads better.
1088               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1089               if (DstTy.getSizeInBits() > MemSize)
1090                 return std::make_pair(0, EltTy);
1091 
1092               unsigned EltSize = EltTy.getSizeInBits();
1093               unsigned DstSize = DstTy.getSizeInBits();
1094               if (!isPowerOf2_32(DstSize)) {
1095                 // We're probably decomposing an odd sized store. Try to split
1096                 // to the widest type. TODO: Account for alignment. As-is it
1097                 // should be OK, since the new parts will be further legalized.
1098                 unsigned FloorSize = PowerOf2Floor(DstSize);
1099                 return std::make_pair(
1100                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1101               }
1102 
1103               // Need to split because of alignment.
1104               unsigned Align = Query.MMODescrs[0].AlignInBits;
1105               if (EltSize > Align &&
1106                   (EltSize / Align < DstTy.getNumElements())) {
1107                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1108               }
1109 
1110               // May need relegalization for the scalars.
1111               return std::make_pair(0, EltTy);
1112             })
1113         .minScalar(0, S32);
1114 
1115     if (IsStore)
1116       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1117 
1118     // TODO: Need a bitcast lower option?
1119     Actions
1120         .widenScalarToNextPow2(0)
1121         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1122   }
1123 
1124   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1125                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1126                                                   {S32, GlobalPtr, 16, 2 * 8},
1127                                                   {S32, LocalPtr, 8, 8},
1128                                                   {S32, LocalPtr, 16, 16},
1129                                                   {S32, PrivatePtr, 8, 8},
1130                                                   {S32, PrivatePtr, 16, 16},
1131                                                   {S32, ConstantPtr, 8, 8},
1132                                                   {S32, ConstantPtr, 16, 2 * 8}});
1133   if (ST.hasFlatAddressSpace()) {
1134     ExtLoads.legalForTypesWithMemDesc(
1135         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1136   }
1137 
1138   ExtLoads.clampScalar(0, S32, S32)
1139           .widenScalarToNextPow2(0)
1140           .unsupportedIfMemSizeNotPow2()
1141           .lower();
1142 
1143   auto &Atomics = getActionDefinitionsBuilder(
1144     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1145      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1146      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1147      G_ATOMICRMW_UMIN})
1148     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1149                {S64, GlobalPtr}, {S64, LocalPtr}});
1150   if (ST.hasFlatAddressSpace()) {
1151     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1152   }
1153 
1154   if (ST.hasLDSFPAtomics()) {
1155     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1156       .legalFor({{S32, LocalPtr}});
1157   }
1158 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1161   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1162     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1163                 {S32, FlatPtr}, {S64, FlatPtr}})
1164     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1165                {S32, RegionPtr}, {S64, RegionPtr}});
1166   // TODO: Pointer types, any 32-bit or 64-bit vector
1167 
1168   // Condition should be s32 for scalar, s1 for vector.
1169   getActionDefinitionsBuilder(G_SELECT)
1170     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1171           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1172           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1173     .clampScalar(0, S16, S64)
1174     .scalarize(1)
1175     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1176     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1177     .clampMaxNumElements(0, S32, 2)
1178     .clampMaxNumElements(0, LocalPtr, 2)
1179     .clampMaxNumElements(0, PrivatePtr, 2)
1180     .scalarize(0)
1181     .widenScalarToNextPow2(0)
1182     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1183 
1184   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1185   // be more flexible with the shift amount type.
1186   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1187     .legalFor({{S32, S32}, {S64, S32}});
1188   if (ST.has16BitInsts()) {
1189     if (ST.hasVOP3PInsts()) {
1190       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1191             .clampMaxNumElements(0, S16, 2);
1192     } else
1193       Shifts.legalFor({{S16, S16}});
1194 
1195     // TODO: Support 16-bit shift amounts for all types
1196     Shifts.widenScalarIf(
1197       [=](const LegalityQuery &Query) {
1198         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1199         // 32-bit amount.
1200         const LLT ValTy = Query.Types[0];
1201         const LLT AmountTy = Query.Types[1];
1202         return ValTy.getSizeInBits() <= 16 &&
1203                AmountTy.getSizeInBits() < 16;
1204       }, changeTo(1, S16));
1205     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1206     Shifts.clampScalar(1, S32, S32);
1207     Shifts.clampScalar(0, S16, S64);
1208     Shifts.widenScalarToNextPow2(0, 16);
1209   } else {
1210     // Make sure we legalize the shift amount type first, as the general
1211     // expansion for the shifted type will produce much worse code if it hasn't
1212     // been truncated already.
1213     Shifts.clampScalar(1, S32, S32);
1214     Shifts.clampScalar(0, S32, S64);
1215     Shifts.widenScalarToNextPow2(0, 32);
1216   }
1217   Shifts.scalarize(0);
1218 
1219   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1220     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1221     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1222     unsigned IdxTypeIdx = 2;
1223 
1224     getActionDefinitionsBuilder(Op)
1225       .customIf([=](const LegalityQuery &Query) {
1226           const LLT EltTy = Query.Types[EltTypeIdx];
1227           const LLT VecTy = Query.Types[VecTypeIdx];
1228           const LLT IdxTy = Query.Types[IdxTypeIdx];
1229           return (EltTy.getSizeInBits() == 16 ||
1230                   EltTy.getSizeInBits() % 32 == 0) &&
1231                  VecTy.getSizeInBits() % 32 == 0 &&
1232                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1233                  IdxTy.getSizeInBits() == 32;
1234         })
1235       .clampScalar(EltTypeIdx, S32, S64)
1236       .clampScalar(VecTypeIdx, S32, S64)
1237       .clampScalar(IdxTypeIdx, S32, S32);
1238   }
1239 
1240   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1241     .unsupportedIf([=](const LegalityQuery &Query) {
1242         const LLT &EltTy = Query.Types[1].getElementType();
1243         return Query.Types[0] != EltTy;
1244       });
1245 
1246   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1247     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1248     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1249 
1250     // FIXME: Doesn't handle extract of illegal sizes.
1251     getActionDefinitionsBuilder(Op)
1252       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1253       // FIXME: Multiples of 16 should not be legal.
1254       .legalIf([=](const LegalityQuery &Query) {
1255           const LLT BigTy = Query.Types[BigTyIdx];
1256           const LLT LitTy = Query.Types[LitTyIdx];
1257           return (BigTy.getSizeInBits() % 32 == 0) &&
1258                  (LitTy.getSizeInBits() % 16 == 0);
1259         })
1260       .widenScalarIf(
1261         [=](const LegalityQuery &Query) {
1262           const LLT BigTy = Query.Types[BigTyIdx];
1263           return (BigTy.getScalarSizeInBits() < 16);
1264         },
1265         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1266       .widenScalarIf(
1267         [=](const LegalityQuery &Query) {
1268           const LLT LitTy = Query.Types[LitTyIdx];
1269           return (LitTy.getScalarSizeInBits() < 16);
1270         },
1271         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1272       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1273       .widenScalarToNextPow2(BigTyIdx, 32);
1274 
1275   }
1276 
1277   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1278     .legalForCartesianProduct(AllS32Vectors, {S32})
1279     .legalForCartesianProduct(AllS64Vectors, {S64})
1280     .clampNumElements(0, V16S32, V32S32)
1281     .clampNumElements(0, V2S64, V16S64)
1282     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1283 
1284   if (ST.hasScalarPackInsts()) {
1285     BuildVector
1286       // FIXME: Should probably widen s1 vectors straight to s32
1287       .minScalarOrElt(0, S16)
1288       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1289       .minScalar(1, S32);
1290 
1291     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1292       .legalFor({V2S16, S32})
1293       .lower();
1294     BuildVector.minScalarOrElt(0, S32);
1295   } else {
1296     BuildVector.customFor({V2S16, S16});
1297     BuildVector.minScalarOrElt(0, S32);
1298 
1299     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1300       .customFor({V2S16, S32})
1301       .lower();
1302   }
1303 
1304   BuildVector.legalIf(isRegisterType(0));
1305 
1306   // FIXME: Clamp maximum size
1307   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1308     .legalIf(isRegisterType(0));
1309 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1312   if (ST.hasVOP3PInsts()) {
1313     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1314       .customFor({V2S16, V2S16})
1315       .lower();
1316   } else
1317     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1318 
1319   // Merge/Unmerge
1320   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1321     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1322     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1323 
1324     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1325       const LLT Ty = Query.Types[TypeIdx];
1326       if (Ty.isVector()) {
1327         const LLT &EltTy = Ty.getElementType();
1328         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1329           return true;
1330         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1331           return true;
1332       }
1333       return false;
1334     };
1335 
1336     auto &Builder = getActionDefinitionsBuilder(Op)
1337       .lowerFor({{S16, V2S16}})
1338       .lowerIf([=](const LegalityQuery &Query) {
1339           const LLT BigTy = Query.Types[BigTyIdx];
1340           return BigTy.getSizeInBits() == 32;
1341         })
1342       // Try to widen to s16 first for small types.
1343       // TODO: Only do this on targets with legal s16 shifts
1344       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1345       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1346       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1347       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1348                            elementTypeIs(1, S16)),
1349                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1353       .clampScalar(LitTyIdx, S32, S512)
1354       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1355       // Break up vectors with weird elements into scalars
1356       .fewerElementsIf(
1357         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1358         scalarize(0))
1359       .fewerElementsIf(
1360         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1361         scalarize(1))
1362       .clampScalar(BigTyIdx, S32, MaxScalar);
1363 
1364     if (Op == G_MERGE_VALUES) {
1365       Builder.widenScalarIf(
1366         // TODO: Use 16-bit shifts if legal for 8-bit values?
1367         [=](const LegalityQuery &Query) {
1368           const LLT Ty = Query.Types[LitTyIdx];
1369           return Ty.getSizeInBits() < 32;
1370         },
1371         changeTo(LitTyIdx, S32));
1372     }
1373 
1374     Builder.widenScalarIf(
1375       [=](const LegalityQuery &Query) {
1376         const LLT Ty = Query.Types[BigTyIdx];
1377         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1378           Ty.getSizeInBits() % 16 != 0;
1379       },
1380       [=](const LegalityQuery &Query) {
        // Pick the next power of 2 or, past 128 bits, the next multiple of
        // 64, whichever is smaller.
1383         const LLT &Ty = Query.Types[BigTyIdx];
1384         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1385         if (NewSizeInBits >= 256) {
1386           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1387           if (RoundedTo < NewSizeInBits)
1388             NewSizeInBits = RoundedTo;
1389         }
1390         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1391       })
1392       .legalIf([=](const LegalityQuery &Query) {
1393           const LLT &BigTy = Query.Types[BigTyIdx];
1394           const LLT &LitTy = Query.Types[LitTyIdx];
1395 
1396           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1397             return false;
1398           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1399             return false;
1400 
1401           return BigTy.getSizeInBits() % 16 == 0 &&
1402                  LitTy.getSizeInBits() % 16 == 0 &&
1403                  BigTy.getSizeInBits() <= MaxRegisterSize;
1404         })
1405       // Any vectors left are the wrong size. Scalarize them.
1406       .scalarize(0)
1407       .scalarize(1);
1408   }
1409 
1410   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1411   // RegBankSelect.
1412   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1413     .legalFor({{S32}, {S64}});
1414 
1415   if (ST.hasVOP3PInsts()) {
1416     SextInReg.lowerFor({{V2S16}})
1417       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1418       // get more vector shift opportunities, since we'll get those when
1419       // expanded.
1420       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1421   } else if (ST.has16BitInsts()) {
1422     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1423   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
1426     SextInReg.lowerFor({{S32}, {S64}});
1427   }
1428 
1429   SextInReg
1430     .scalarize(0)
1431     .clampScalar(0, S32, S64)
1432     .lower();
1433 
1434   getActionDefinitionsBuilder(G_FSHR)
1435     .legalFor({{S32, S32}})
1436     .scalarize(0)
1437     .lower();
1438 
1439   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1440     .legalFor({S64});
1441 
1442   getActionDefinitionsBuilder({
1443       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1444       G_FCOPYSIGN,
1445 
1446       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1447       G_READ_REGISTER,
1448       G_WRITE_REGISTER,
1449 
1450       G_SADDO, G_SSUBO,
1451 
1452        // TODO: Implement
1453       G_FMINIMUM, G_FMAXIMUM,
1454       G_FSHL
1455     }).lower();
1456 
1457   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1458         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1459         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1460     .unsupported();
1461 
1462   computeTables();
1463   verify(*ST.getInstrInfo());
1464 }
1465 
1466 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1467                                          MachineRegisterInfo &MRI,
1468                                          MachineIRBuilder &B,
1469                                          GISelChangeObserver &Observer) const {
1470   switch (MI.getOpcode()) {
1471   case TargetOpcode::G_ADDRSPACE_CAST:
1472     return legalizeAddrSpaceCast(MI, MRI, B);
1473   case TargetOpcode::G_FRINT:
1474     return legalizeFrint(MI, MRI, B);
1475   case TargetOpcode::G_FCEIL:
1476     return legalizeFceil(MI, MRI, B);
1477   case TargetOpcode::G_INTRINSIC_TRUNC:
1478     return legalizeIntrinsicTrunc(MI, MRI, B);
1479   case TargetOpcode::G_SITOFP:
1480     return legalizeITOFP(MI, MRI, B, true);
1481   case TargetOpcode::G_UITOFP:
1482     return legalizeITOFP(MI, MRI, B, false);
1483   case TargetOpcode::G_FPTOSI:
1484     return legalizeFPTOI(MI, MRI, B, true);
1485   case TargetOpcode::G_FPTOUI:
1486     return legalizeFPTOI(MI, MRI, B, false);
1487   case TargetOpcode::G_FMINNUM:
1488   case TargetOpcode::G_FMAXNUM:
1489   case TargetOpcode::G_FMINNUM_IEEE:
1490   case TargetOpcode::G_FMAXNUM_IEEE:
1491     return legalizeMinNumMaxNum(MI, MRI, B);
1492   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1493     return legalizeExtractVectorElt(MI, MRI, B);
1494   case TargetOpcode::G_INSERT_VECTOR_ELT:
1495     return legalizeInsertVectorElt(MI, MRI, B);
1496   case TargetOpcode::G_SHUFFLE_VECTOR:
1497     return legalizeShuffleVector(MI, MRI, B);
1498   case TargetOpcode::G_FSIN:
1499   case TargetOpcode::G_FCOS:
1500     return legalizeSinCos(MI, MRI, B);
1501   case TargetOpcode::G_GLOBAL_VALUE:
1502     return legalizeGlobalValue(MI, MRI, B);
1503   case TargetOpcode::G_LOAD:
1504     return legalizeLoad(MI, MRI, B, Observer);
1505   case TargetOpcode::G_FMAD:
1506     return legalizeFMad(MI, MRI, B);
1507   case TargetOpcode::G_FDIV:
1508     return legalizeFDIV(MI, MRI, B);
1509   case TargetOpcode::G_UDIV:
1510   case TargetOpcode::G_UREM:
1511     return legalizeUDIV_UREM(MI, MRI, B);
1512   case TargetOpcode::G_SDIV:
1513   case TargetOpcode::G_SREM:
1514     return legalizeSDIV_SREM(MI, MRI, B);
1515   case TargetOpcode::G_ATOMIC_CMPXCHG:
1516     return legalizeAtomicCmpXChg(MI, MRI, B);
1517   case TargetOpcode::G_FLOG:
1518     return legalizeFlog(MI, B, numbers::ln2f);
1519   case TargetOpcode::G_FLOG10:
1520     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1521   case TargetOpcode::G_FEXP:
1522     return legalizeFExp(MI, B);
1523   case TargetOpcode::G_FPOW:
1524     return legalizeFPow(MI, B);
1525   case TargetOpcode::G_FFLOOR:
1526     return legalizeFFloor(MI, MRI, B);
1527   case TargetOpcode::G_BUILD_VECTOR:
1528     return legalizeBuildVector(MI, MRI, B);
1529   default:
1530     return false;
1531   }
1532 
1533   llvm_unreachable("expected switch to return");
1534 }
1535 
1536 Register AMDGPULegalizerInfo::getSegmentAperture(
1537   unsigned AS,
1538   MachineRegisterInfo &MRI,
1539   MachineIRBuilder &B) const {
1540   MachineFunction &MF = B.getMF();
1541   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1542   const LLT S32 = LLT::scalar(32);
1543 
1544   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1545 
1546   if (ST.hasApertureRegs()) {
1547     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1548     // getreg.
1549     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1550         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1551         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1552     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1553         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1554         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1555     unsigned Encoding =
1556         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1557         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1558         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1559 
1560     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1561 
1562     B.buildInstr(AMDGPU::S_GETREG_B32)
1563       .addDef(GetReg)
1564       .addImm(Encoding);
1565     MRI.setType(GetReg, S32);
1566 
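    // The hwreg field holds the top bits of the aperture; shift it left by the
    // field width (WidthM1 + 1 == 16) to form the high 32 bits of the 64-bit
    // aperture base used by the caller.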
1567     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1568     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1569   }
1570 
1571   Register QueuePtr = MRI.createGenericVirtualRegister(
1572     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1573 
1574   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1575   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1576     return Register();
1577 
1578   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1579   // private_segment_aperture_base_hi.
1580   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1581 
1582   // TODO: can we be smarter about machine pointer info?
1583   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1584   MachineMemOperand *MMO = MF.getMachineMemOperand(
1585       PtrInfo,
1586       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1587           MachineMemOperand::MOInvariant,
1588       4, commonAlignment(Align(64), StructOffset));
1589 
1590   Register LoadAddr;
1591 
1592   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1593   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1594 }
1595 
1596 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1597   MachineInstr &MI, MachineRegisterInfo &MRI,
1598   MachineIRBuilder &B) const {
1599   MachineFunction &MF = B.getMF();
1600 
1601   const LLT S32 = LLT::scalar(32);
1602   Register Dst = MI.getOperand(0).getReg();
1603   Register Src = MI.getOperand(1).getReg();
1604 
1605   LLT DstTy = MRI.getType(Dst);
1606   LLT SrcTy = MRI.getType(Src);
1607   unsigned DestAS = DstTy.getAddressSpace();
1608   unsigned SrcAS = SrcTy.getAddressSpace();
1609 
1610   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1611   // vector element.
1612   assert(!DstTy.isVector());
1613 
1614   const AMDGPUTargetMachine &TM
1615     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1616 
1617   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1618   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1619     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1620     return true;
1621   }
1622 
1623   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1624     // Truncate.
1625     B.buildExtract(Dst, Src, 0);
1626     MI.eraseFromParent();
1627     return true;
1628   }
1629 
1630   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1631     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1632     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1633 
    // FIXME: This is a bit ugly since we merge two 32-bit pointer values into
    // a 64-bit pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1637     auto HighAddr = B.buildConstant(
1638       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1639     B.buildMerge(Dst, {Src, HighAddr});
1640     MI.eraseFromParent();
1641     return true;
1642   }
1643 
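  // Casting a flat pointer to LDS/private: the segment pointer is the low 32
  // bits of the flat pointer, except that the flat null pointer must map to
  // the segment null value.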
1644   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1645     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1646            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1647     unsigned NullVal = TM.getNullPointerValue(DestAS);
1648 
1649     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1650     auto FlatNull = B.buildConstant(SrcTy, 0);
1651 
1652     // Extract low 32-bits of the pointer.
1653     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1654 
1655     auto CmpRes =
1656         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1657     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1658 
1659     MI.eraseFromParent();
1660     return true;
1661   }
1662 
1663   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1664     return false;
1665 
1666   if (!ST.hasFlatAddressSpace())
1667     return false;
1668 
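  // Casting LDS/private to flat: build the 64-bit flat pointer from the
  // 32-bit segment offset (low half) and the segment aperture base (high
  // half), mapping the segment null value to the flat null pointer.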
1669   auto SegmentNull =
1670       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1671   auto FlatNull =
1672       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1673 
1674   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1675   if (!ApertureReg.isValid())
1676     return false;
1677 
1678   auto CmpRes =
1679       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1680 
1681   // Coerce the type of the low half of the result so we can use merge_values.
1682   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1683 
1684   // TODO: Should we allow mismatched types but matching sizes in merges to
1685   // avoid the ptrtoint?
1686   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1687   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1688 
1689   MI.eraseFromParent();
1690   return true;
1691 }
1692 
1693 bool AMDGPULegalizerInfo::legalizeFrint(
1694   MachineInstr &MI, MachineRegisterInfo &MRI,
1695   MachineIRBuilder &B) const {
1696   Register Src = MI.getOperand(1).getReg();
1697   LLT Ty = MRI.getType(Src);
1698   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1699 
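  // Use the usual 2^52 trick: for |Src| < 2^52, adding and then subtracting
  // copysign(2^52, Src) rounds Src to an integer in the current rounding mode.
  // Values with |Src| > C2Val (just below 2^52) are already integral and are
  // passed through by the final select.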
1700   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1701   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1702 
1703   auto C1 = B.buildFConstant(Ty, C1Val);
1704   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1705 
1706   // TODO: Should this propagate fast-math-flags?
1707   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1708   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1709 
1710   auto C2 = B.buildFConstant(Ty, C2Val);
1711   auto Fabs = B.buildFAbs(Ty, Src);
1712 
1713   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1714   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1715   return true;
1716 }
1717 
1718 bool AMDGPULegalizerInfo::legalizeFceil(
1719   MachineInstr &MI, MachineRegisterInfo &MRI,
1720   MachineIRBuilder &B) const {
1721 
1722   const LLT S1 = LLT::scalar(1);
1723   const LLT S64 = LLT::scalar(64);
1724 
1725   Register Src = MI.getOperand(1).getReg();
1726   assert(MRI.getType(Src) == S64);
1727 
1728   // result = trunc(src)
1729   // if (src > 0.0 && src != result)
1730   //   result += 1.0
1731 
1732   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1733 
1734   const auto Zero = B.buildFConstant(S64, 0.0);
1735   const auto One = B.buildFConstant(S64, 1.0);
1736   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1737   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1738   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1739   auto Add = B.buildSelect(S64, And, One, Zero);
1740 
1741   // TODO: Should this propagate fast-math-flags?
1742   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1743   return true;
1744 }
1745 
1746 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1747                                               MachineIRBuilder &B) {
1748   const unsigned FractBits = 52;
1749   const unsigned ExpBits = 11;
1750   LLT S32 = LLT::scalar(32);
1751 
1752   auto Const0 = B.buildConstant(S32, FractBits - 32);
1753   auto Const1 = B.buildConstant(S32, ExpBits);
1754 
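  // Extract the 11-bit biased exponent, which starts at bit 20 of the high
  // word (bit 52 of the f64), and subtract the exponent bias of 1023.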
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1758 
1759   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1760 }
1761 
1762 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1763   MachineInstr &MI, MachineRegisterInfo &MRI,
1764   MachineIRBuilder &B) const {
1765   const LLT S1 = LLT::scalar(1);
1766   const LLT S32 = LLT::scalar(32);
1767   const LLT S64 = LLT::scalar(64);
1768 
1769   Register Src = MI.getOperand(1).getReg();
1770   assert(MRI.getType(Src) == S64);
1771 
1772   // TODO: Should this use extract since the low half is unused?
1773   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1774   Register Hi = Unmerge.getReg(1);
1775 
1776   // Extract the upper half, since this is where we will find the sign and
1777   // exponent.
1778   auto Exp = extractF64Exponent(Hi, B);
1779 
1780   const unsigned FractBits = 52;
1781 
1782   // Extract the sign bit.
1783   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1784   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1785 
1786   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1787 
1788   const auto Zero32 = B.buildConstant(S32, 0);
1789 
1790   // Extend back to 64-bits.
1791   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1792 
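  // For 0 <= Exp <= 51, the low (52 - Exp) mantissa bits hold the fractional
  // part of the value; clear them to truncate toward zero.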
1793   auto Shr = B.buildAShr(S64, FractMask, Exp);
1794   auto Not = B.buildNot(S64, Shr);
1795   auto Tmp0 = B.buildAnd(S64, Src, Not);
1796   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1797 
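  // Exp < 0 means |Src| < 1, so the truncated result is just the sign (+/-0).
  // Exp > 51 means there are no fractional bits, so Src is already integral.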
1798   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1799   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1800 
1801   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1802   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1803   return true;
1804 }
1805 
1806 bool AMDGPULegalizerInfo::legalizeITOFP(
1807   MachineInstr &MI, MachineRegisterInfo &MRI,
1808   MachineIRBuilder &B, bool Signed) const {
1809 
1810   Register Dst = MI.getOperand(0).getReg();
1811   Register Src = MI.getOperand(1).getReg();
1812 
1813   const LLT S64 = LLT::scalar(64);
1814   const LLT S32 = LLT::scalar(32);
1815 
1816   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1817 
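  // Convert the two 32-bit halves separately: convert the high half (signed
  // or unsigned as appropriate), scale it by 2^32 with ldexp, then add in the
  // unsigned conversion of the low half.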
1818   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1819 
1820   auto CvtHi = Signed ?
1821     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1822     B.buildUITOFP(S64, Unmerge.getReg(1));
1823 
1824   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1825 
1826   auto ThirtyTwo = B.buildConstant(S32, 32);
1827   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1828     .addUse(CvtHi.getReg(0))
1829     .addUse(ThirtyTwo.getReg(0));
1830 
1831   // TODO: Should this propagate fast-math-flags?
1832   B.buildFAdd(Dst, LdExp, CvtLo);
1833   MI.eraseFromParent();
1834   return true;
1835 }
1836 
1837 // TODO: Copied from DAG implementation. Verify logic and document how this
1838 // actually works.
1839 bool AMDGPULegalizerInfo::legalizeFPTOI(
1840   MachineInstr &MI, MachineRegisterInfo &MRI,
1841   MachineIRBuilder &B, bool Signed) const {
1842 
1843   Register Dst = MI.getOperand(0).getReg();
1844   Register Src = MI.getOperand(1).getReg();
1845 
1846   const LLT S64 = LLT::scalar(64);
1847   const LLT S32 = LLT::scalar(32);
1848 
1849   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1850 
1851   unsigned Flags = MI.getFlags();
1852 
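  // K0 = 2^-32 and K1 = -2^32. Split trunc(Src) into a high part
  // floor(trunc(Src) * 2^-32) and a low part trunc(Src) - Hi * 2^32 (computed
  // with the fma), then convert each 32-bit half and merge the results.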
1853   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1854   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1855   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1856 
1857   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1858   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1859   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1860 
1861   auto Hi = Signed ?
1862     B.buildFPTOSI(S32, FloorMul) :
1863     B.buildFPTOUI(S32, FloorMul);
1864   auto Lo = B.buildFPTOUI(S32, Fma);
1865 
1866   B.buildMerge(Dst, { Lo, Hi });
1867   MI.eraseFromParent();
1868 
1869   return true;
1870 }
1871 
1872 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1873   MachineInstr &MI, MachineRegisterInfo &MRI,
1874   MachineIRBuilder &B) const {
1875   MachineFunction &MF = B.getMF();
1876   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1877 
1878   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1879                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1880 
1881   // With ieee_mode disabled, the instructions have the correct behavior
1882   // already for G_FMINNUM/G_FMAXNUM
1883   if (!MFI->getMode().IEEE)
1884     return !IsIEEEOp;
1885 
1886   if (IsIEEEOp)
1887     return true;
1888 
1889   MachineIRBuilder HelperBuilder(MI);
1890   GISelObserverWrapper DummyObserver;
1891   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1892   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1893 }
1894 
1895 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1896   MachineInstr &MI, MachineRegisterInfo &MRI,
1897   MachineIRBuilder &B) const {
1898   // TODO: Should move some of this into LegalizerHelper.
1899 
1900   // TODO: Promote dynamic indexing of s16 to s32
1901 
1902   // FIXME: Artifact combiner probably should have replaced the truncated
1903   // constant before this, so we shouldn't need
1904   // getConstantVRegValWithLookThrough.
1905   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1906     MI.getOperand(2).getReg(), MRI);
1907   if (!IdxVal) // Dynamic case will be selected to register indexing.
1908     return true;
1909 
1910   Register Dst = MI.getOperand(0).getReg();
1911   Register Vec = MI.getOperand(1).getReg();
1912 
1913   LLT VecTy = MRI.getType(Vec);
1914   LLT EltTy = VecTy.getElementType();
1915   assert(EltTy == MRI.getType(Dst));
1916 
1917   if (IdxVal->Value < VecTy.getNumElements())
1918     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1919   else
1920     B.buildUndef(Dst);
1921 
1922   MI.eraseFromParent();
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   // TODO: Should move some of this into LegalizerHelper.
1930 
1931   // TODO: Promote dynamic indexing of s16 to s32
1932 
1933   // FIXME: Artifact combiner probably should have replaced the truncated
1934   // constant before this, so we shouldn't need
1935   // getConstantVRegValWithLookThrough.
1936   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1937     MI.getOperand(3).getReg(), MRI);
1938   if (!IdxVal) // Dynamic case will be selected to register indexing.
1939     return true;
1940 
1941   Register Dst = MI.getOperand(0).getReg();
1942   Register Vec = MI.getOperand(1).getReg();
1943   Register Ins = MI.getOperand(2).getReg();
1944 
1945   LLT VecTy = MRI.getType(Vec);
1946   LLT EltTy = VecTy.getElementType();
1947   assert(EltTy == MRI.getType(Ins));
1948 
1949   if (IdxVal->Value < VecTy.getNumElements())
1950     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1951   else
1952     B.buildUndef(Dst);
1953 
1954   MI.eraseFromParent();
1955   return true;
1956 }
1957 
1958 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1959   MachineInstr &MI, MachineRegisterInfo &MRI,
1960   MachineIRBuilder &B) const {
1961   const LLT V2S16 = LLT::vector(2, 16);
1962 
1963   Register Dst = MI.getOperand(0).getReg();
1964   Register Src0 = MI.getOperand(1).getReg();
1965   LLT DstTy = MRI.getType(Dst);
1966   LLT SrcTy = MRI.getType(Src0);
1967 
1968   if (SrcTy == V2S16 && DstTy == V2S16 &&
1969       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1970     return true;
1971 
1972   MachineIRBuilder HelperBuilder(MI);
1973   GISelObserverWrapper DummyObserver;
1974   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1975   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeSinCos(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B) const {
1981 
1982   Register DstReg = MI.getOperand(0).getReg();
1983   Register SrcReg = MI.getOperand(1).getReg();
1984   LLT Ty = MRI.getType(DstReg);
1985   unsigned Flags = MI.getFlags();
1986 
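  // The hardware sin/cos intrinsics take an input in units of 2*pi radians,
  // so scale by 1/(2*pi) first. Subtargets with a reduced valid input range
  // also need the operand wrapped into [0, 1) with fract.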
1987   Register TrigVal;
1988   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1989   if (ST.hasTrigReducedRange()) {
1990     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1991     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1992       .addUse(MulVal.getReg(0))
1993       .setMIFlags(Flags).getReg(0);
1994   } else
1995     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1996 
1997   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1998     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1999   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2000     .addUse(TrigVal)
2001     .setMIFlags(Flags);
2002   MI.eraseFromParent();
2003   return true;
2004 }
2005 
2006 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
2007   Register DstReg, LLT PtrTy,
2008   MachineIRBuilder &B, const GlobalValue *GV,
2009   unsigned Offset, unsigned GAFlags) const {
2010   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2011   // to the following code sequence:
2012   //
2013   // For constant address space:
2014   //   s_getpc_b64 s[0:1]
2015   //   s_add_u32 s0, s0, $symbol
2016   //   s_addc_u32 s1, s1, 0
2017   //
2018   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2019   //   a fixup or relocation is emitted to replace $symbol with a literal
2020   //   constant, which is a pc-relative offset from the encoding of the $symbol
2021   //   operand to the global variable.
2022   //
2023   // For global address space:
2024   //   s_getpc_b64 s[0:1]
2025   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2026   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2027   //
2028   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2029   //   fixups or relocations are emitted to replace $symbol@*@lo and
2030   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2031   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2032   //   operand to the global variable.
2033   //
2034   // What we want here is an offset from the value returned by s_getpc
2035   // (which is the address of the s_add_u32 instruction) to the global
2036   // variable, but since the encoding of $symbol starts 4 bytes after the start
2037   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2038   // small. This requires us to add 4 to the global variable offset in order to
2039   // compute the correct address.
2040 
2041   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2042 
2043   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2044     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2045 
2046   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2047     .addDef(PCReg);
2048 
2049   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2050   if (GAFlags == SIInstrInfo::MO_NONE)
2051     MIB.addImm(0);
2052   else
2053     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2054 
2055   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2056 
2057   if (PtrTy.getSizeInBits() == 32)
2058     B.buildExtract(DstReg, PCReg, 0);
2059   return true;
}
2061 
2062 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2063   MachineInstr &MI, MachineRegisterInfo &MRI,
2064   MachineIRBuilder &B) const {
2065   Register DstReg = MI.getOperand(0).getReg();
2066   LLT Ty = MRI.getType(DstReg);
2067   unsigned AS = Ty.getAddressSpace();
2068 
2069   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2070   MachineFunction &MF = B.getMF();
2071   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2072 
2073   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2074     if (!MFI->isEntryFunction()) {
2075       const Function &Fn = MF.getFunction();
2076       DiagnosticInfoUnsupported BadLDSDecl(
2077         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2078         DS_Warning);
2079       Fn.getContext().diagnose(BadLDSDecl);
2080 
2081       // We currently don't have a way to correctly allocate LDS objects that
2082       // aren't directly associated with a kernel. We do force inlining of
2083       // functions that use local objects. However, if these dead functions are
2084       // not eliminated, we don't want a compile time error. Just emit a warning
2085       // and a trap, since there should be no callable path here.
2086       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2087       B.buildUndef(DstReg);
2088       MI.eraseFromParent();
2089       return true;
2090     }
2091 
2092     // TODO: We could emit code to handle the initialization somewhere.
2093     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2094       const SITargetLowering *TLI = ST.getTargetLowering();
2095       if (!TLI->shouldUseLDSConstAddress(GV)) {
2096         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2098       }
2099 
2100       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2101       MI.eraseFromParent();
2102       return true;
2103     }
2104 
2105     const Function &Fn = MF.getFunction();
2106     DiagnosticInfoUnsupported BadInit(
2107       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2108     Fn.getContext().diagnose(BadInit);
2109     return true;
2110   }
2111 
2112   const SITargetLowering *TLI = ST.getTargetLowering();
2113 
2114   if (TLI->shouldEmitFixup(GV)) {
2115     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2116     MI.eraseFromParent();
2117     return true;
2118   }
2119 
2120   if (TLI->shouldEmitPCReloc(GV)) {
2121     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2122     MI.eraseFromParent();
2123     return true;
2124   }
2125 
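  // Otherwise load the 64-bit absolute address of the global from its GOT
  // entry, which is itself addressed PC-relative.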
2126   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2127   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2128 
2129   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2130       MachinePointerInfo::getGOT(MF),
2131       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2132           MachineMemOperand::MOInvariant,
2133       8 /*Size*/, Align(8));
2134 
2135   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2136 
2137   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2139     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2140     B.buildExtract(DstReg, Load, 0);
2141   } else
2142     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2143 
2144   MI.eraseFromParent();
2145   return true;
2146 }
2147 
2148 bool AMDGPULegalizerInfo::legalizeLoad(
2149   MachineInstr &MI, MachineRegisterInfo &MRI,
2150   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2151   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2152   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2153   Observer.changingInstr(MI);
2154   MI.getOperand(1).setReg(Cast.getReg(0));
2155   Observer.changedInstr(MI);
2156   return true;
2157 }
2158 
2159 bool AMDGPULegalizerInfo::legalizeFMad(
2160   MachineInstr &MI, MachineRegisterInfo &MRI,
2161   MachineIRBuilder &B) const {
2162   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2163   assert(Ty.isScalar());
2164 
2165   MachineFunction &MF = B.getMF();
2166   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2167 
2168   // TODO: Always legal with future ftz flag.
2169   // FIXME: Do we need just output?
2170   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2171     return true;
2172   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2173     return true;
2174 
2175   MachineIRBuilder HelperBuilder(MI);
2176   GISelObserverWrapper DummyObserver;
2177   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2178   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2179 }
2180 
2181 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2182   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2183   Register DstReg = MI.getOperand(0).getReg();
2184   Register PtrReg = MI.getOperand(1).getReg();
2185   Register CmpVal = MI.getOperand(2).getReg();
2186   Register NewVal = MI.getOperand(3).getReg();
2187 
2188   assert(SITargetLowering::isFlatGlobalAddrSpace(
2189            MRI.getType(PtrReg).getAddressSpace()) &&
2190          "this should not have been custom lowered");
2191 
2192   LLT ValTy = MRI.getType(CmpVal);
2193   LLT VecTy = LLT::vector(2, ValTy);
2194 
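  // The target atomic cmpxchg pseudo takes the new value and the compare
  // value packed together as a single 2-element vector operand.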
2195   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2196 
2197   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2198     .addDef(DstReg)
2199     .addUse(PtrReg)
2200     .addUse(PackedVal)
2201     .setMemRefs(MI.memoperands());
2202 
2203   MI.eraseFromParent();
2204   return true;
2205 }
2206 
2207 bool AMDGPULegalizerInfo::legalizeFlog(
2208   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2209   Register Dst = MI.getOperand(0).getReg();
2210   Register Src = MI.getOperand(1).getReg();
2211   LLT Ty = B.getMRI()->getType(Dst);
2212   unsigned Flags = MI.getFlags();
2213 
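  // Expand as log_b(x) = log2(x) * log_b(2), where the caller passes log_b(2)
  // as Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10):
  //   %log2 = G_FLOG2 %src
  //   %dst = G_FMUL %log2, Log2BaseInverted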
2214   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2215   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2216 
2217   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2218   MI.eraseFromParent();
2219   return true;
2220 }
2221 
2222 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2223                                        MachineIRBuilder &B) const {
2224   Register Dst = MI.getOperand(0).getReg();
2225   Register Src = MI.getOperand(1).getReg();
2226   unsigned Flags = MI.getFlags();
2227   LLT Ty = B.getMRI()->getType(Dst);
2228 
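  // Expand as exp(x) = exp2(x * log2(e)):
  //   %k = G_FCONSTANT log2(e)
  //   %mul = G_FMUL %src, %k
  //   %dst = G_FEXP2 %mul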
2229   auto K = B.buildFConstant(Ty, numbers::log2e);
2230   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2231   B.buildFExp2(Dst, Mul, Flags);
2232   MI.eraseFromParent();
2233   return true;
2234 }
2235 
2236 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2237                                        MachineIRBuilder &B) const {
2238   Register Dst = MI.getOperand(0).getReg();
2239   Register Src0 = MI.getOperand(1).getReg();
2240   Register Src1 = MI.getOperand(2).getReg();
2241   unsigned Flags = MI.getFlags();
2242   LLT Ty = B.getMRI()->getType(Dst);
2243   const LLT S16 = LLT::scalar(16);
2244   const LLT S32 = LLT::scalar(32);
2245 
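  // Expand as pow(x, y) = exp2(y * log2(x)), multiplying with the legacy
  // semantics where 0 * anything is 0.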
2246   if (Ty == S32) {
2247     auto Log = B.buildFLog2(S32, Src0, Flags);
2248     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2249       .addUse(Log.getReg(0))
2250       .addUse(Src1)
2251       .setMIFlags(Flags);
2252     B.buildFExp2(Dst, Mul, Flags);
2253   } else if (Ty == S16) {
2254     // There's no f16 fmul_legacy, so we need to convert for it.
2255     auto Log = B.buildFLog2(S16, Src0, Flags);
2256     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2257     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2258     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2259       .addUse(Ext0.getReg(0))
2260       .addUse(Ext1.getReg(0))
2261       .setMIFlags(Flags);
2262 
2263     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2264   } else
2265     return false;
2266 
2267   MI.eraseFromParent();
2268   return true;
2269 }
2270 
2271 // Find a source register, ignoring any possible source modifiers.
2272 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2273   Register ModSrc = OrigSrc;
2274   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2275     ModSrc = SrcFNeg->getOperand(1).getReg();
2276     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2277       ModSrc = SrcFAbs->getOperand(1).getReg();
2278   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2279     ModSrc = SrcFAbs->getOperand(1).getReg();
2280   return ModSrc;
2281 }
2282 
2283 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2284                                          MachineRegisterInfo &MRI,
2285                                          MachineIRBuilder &B) const {
2286 
2287   const LLT S1 = LLT::scalar(1);
2288   const LLT S64 = LLT::scalar(64);
2289   Register Dst = MI.getOperand(0).getReg();
2290   Register OrigSrc = MI.getOperand(1).getReg();
2291   unsigned Flags = MI.getFlags();
2292   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2293          "this should not have been custom lowered");
2294 
2295   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2296   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2297   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2298   // V_FRACT bug is:
2299   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2300   //
2301   // Convert floor(x) to (x - fract(x))
2302 
2303   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2304     .addUse(OrigSrc)
2305     .setMIFlags(Flags);
2306 
2307   // Give source modifier matching some assistance before obscuring a foldable
2308   // pattern.
2309 
2310   // TODO: We can avoid the neg on the fract? The input sign to fract
2311   // shouldn't matter?
2312   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2313 
2314   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2315 
2316   Register Min = MRI.createGenericVirtualRegister(S64);
2317 
2318   // We don't need to concern ourselves with the snan handling difference, so
2319   // use the one which will directly select.
2320   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2321   if (MFI->getMode().IEEE)
2322     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2323   else
2324     B.buildFMinNum(Min, Fract, Const, Flags);
2325 
2326   Register CorrectedFract = Min;
2327   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // An unordered self-compare is true exactly when ModSrc is a NaN.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2329     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2330   }
2331 
2332   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2333   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2334 
2335   MI.eraseFromParent();
2336   return true;
2337 }
2338 
2339 // Turn an illegal packed v2s16 build vector into bit operations.
2340 // TODO: This should probably be a bitcast action in LegalizerHelper.
2341 bool AMDGPULegalizerInfo::legalizeBuildVector(
2342   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2343   Register Dst = MI.getOperand(0).getReg();
2344   const LLT S32 = LLT::scalar(32);
2345   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2346 
2347   Register Src0 = MI.getOperand(1).getReg();
2348   Register Src1 = MI.getOperand(2).getReg();
2349   assert(MRI.getType(Src0) == LLT::scalar(16));
2350 
2351   auto Merge = B.buildMerge(S32, {Src0, Src1});
2352   B.buildBitcast(Dst, Merge);
2353 
2354   MI.eraseFromParent();
2355   return true;
2356 }
2357 
2358 // Return the use branch instruction, otherwise null if the usage is invalid.
2359 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2360                                        MachineRegisterInfo &MRI,
2361                                        MachineInstr *&Br,
2362                                        MachineBasicBlock *&UncondBrTarget) {
2363   Register CondDef = MI.getOperand(0).getReg();
2364   if (!MRI.hasOneNonDBGUse(CondDef))
2365     return nullptr;
2366 
2367   MachineBasicBlock *Parent = MI.getParent();
2368   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2369   if (UseMI.getParent() != Parent ||
2370       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2371     return nullptr;
2372 
2373   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2374   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2375   if (Next == Parent->end()) {
2376     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2377     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2378       return nullptr;
2379     UncondBrTarget = &*NextMBB;
2380   } else {
2381     if (Next->getOpcode() != AMDGPU::G_BR)
2382       return nullptr;
2383     Br = &*Next;
2384     UncondBrTarget = Br->getOperand(0).getMBB();
2385   }
2386 
2387   return &UseMI;
2388 }
2389 
2390 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2391                                                MachineRegisterInfo &MRI,
2392                                                Register LiveIn,
2393                                                Register PhyReg) const {
2394   assert(PhyReg.isPhysical() && "Physical register expected");
2395 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2398   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2399   if (!MRI.getVRegDef(LiveIn)) {
2400     // FIXME: Should have scoped insert pt
2401     MachineBasicBlock &OrigInsBB = B.getMBB();
2402     auto OrigInsPt = B.getInsertPt();
2403 
2404     MachineBasicBlock &EntryMBB = B.getMF().front();
2405     EntryMBB.addLiveIn(PhyReg);
2406     B.setInsertPt(EntryMBB, EntryMBB.begin());
2407     B.buildCopy(LiveIn, PhyReg);
2408 
2409     B.setInsertPt(OrigInsBB, OrigInsPt);
2410   }
2411 
2412   return LiveIn;
2413 }
2414 
2415 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2416                                                 MachineRegisterInfo &MRI,
2417                                                 Register PhyReg, LLT Ty,
2418                                                 bool InsertLiveInCopy) const {
2419   assert(PhyReg.isPhysical() && "Physical register expected");
2420 
  // Get or create the virtual live-in register.
2422   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2423   if (!LiveIn) {
2424     LiveIn = MRI.createGenericVirtualRegister(Ty);
2425     MRI.addLiveIn(PhyReg, LiveIn);
2426   }
2427 
  // When the copy that is actually required is from a virtual register to a
  // physical register (to be inserted later), there is no need to insert a
  // live-in copy from the physical register to the virtual register.
2431   if (!InsertLiveInCopy)
2432     return LiveIn;
2433 
2434   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2435 }
2436 
2437 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2438     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2439   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2440   const ArgDescriptor *Arg;
2441   const TargetRegisterClass *RC;
2442   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2443   if (!Arg) {
2444     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2445     return nullptr;
2446   }
2447   return Arg;
2448 }
2449 
2450 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2451                                          const ArgDescriptor *Arg) const {
2452   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2453     return false; // TODO: Handle these
2454 
2455   Register SrcReg = Arg->getRegister();
2456   assert(SrcReg.isPhysical() && "Physical register expected");
2457   assert(DstReg.isVirtual() && "Virtual register expected");
2458 
2459   MachineRegisterInfo &MRI = *B.getMRI();
2460 
2461   LLT Ty = MRI.getType(DstReg);
2462   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2463 
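  // Some inputs are packed into a single register (e.g. the workitem IDs);
  // shift and mask to extract the field described by the descriptor.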
2464   if (Arg->isMasked()) {
2465     // TODO: Should we try to emit this once in the entry block?
2466     const LLT S32 = LLT::scalar(32);
2467     const unsigned Mask = Arg->getMask();
2468     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2469 
2470     Register AndMaskSrc = LiveIn;
2471 
2472     if (Shift != 0) {
2473       auto ShiftAmt = B.buildConstant(S32, Shift);
2474       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2475     }
2476 
2477     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2478   } else {
2479     B.buildCopy(DstReg, LiveIn);
2480   }
2481 
2482   return true;
2483 }
2484 
2485 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2486     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2487     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2488 
2489   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2490   if (!Arg)
2491     return false;
2492 
2493   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2494     return false;
2495 
2496   MI.eraseFromParent();
2497   return true;
2498 }
2499 
2500 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2501                                        MachineRegisterInfo &MRI,
2502                                        MachineIRBuilder &B) const {
2503   Register Dst = MI.getOperand(0).getReg();
2504   LLT DstTy = MRI.getType(Dst);
2505   LLT S16 = LLT::scalar(16);
2506   LLT S32 = LLT::scalar(32);
2507   LLT S64 = LLT::scalar(64);
2508 
2509   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2510     return true;
2511 
2512   if (DstTy == S16)
2513     return legalizeFDIV16(MI, MRI, B);
2514   if (DstTy == S32)
2515     return legalizeFDIV32(MI, MRI, B);
2516   if (DstTy == S64)
2517     return legalizeFDIV64(MI, MRI, B);
2518 
2519   return false;
2520 }
2521 
2522 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2523   const LLT S32 = LLT::scalar(32);
2524 
2525   auto Cvt0 = B.buildUITOFP(S32, Src);
2526   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2527   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2528   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2529   return B.buildFPTOUI(S32, Mul).getReg(0);
2530 }
2531 
2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2533                                                   Register DstReg,
2534                                                   Register Num,
2535                                                   Register Den,
2536                                                   bool IsRem) const {
2537   const LLT S1 = LLT::scalar(1);
2538   const LLT S32 = LLT::scalar(32);
2539 
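  // Expand using a float reciprocal estimate of 2^32 / Den, refined with a
  // fixed-point error term, followed by a final +/-1 correction of the
  // quotient (or the corresponding +/-Den correction of the remainder).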
2540   // RCP =  URECIP(Den) = 2^32 / Den + e
2541   // e is rounding error.
2542   auto RCP = buildDivRCP(B, Den);
2543 
2544   // RCP_LO = mul(RCP, Den)
2545   auto RCP_LO = B.buildMul(S32, RCP, Den);
2546 
  // RCP_HI = mulhu(RCP, Den)
2548   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2549 
2550   // NEG_RCP_LO = -RCP_LO
2551   auto Zero = B.buildConstant(S32, 0);
2552   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2553 
2554   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2555   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2556   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2557 
2558   // Calculate the rounding error from the URECIP instruction
2559   // E = mulhu(ABS_RCP_LO, RCP)
2560   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2561 
2562   // RCP_A_E = RCP + E
2563   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2564 
2565   // RCP_S_E = RCP - E
2566   auto RCP_S_E = B.buildSub(S32, RCP, E);
2567 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2569   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2570 
  // Quotient = mulhu(Tmp0, Num)
2572   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2573 
2574   // Num_S_Remainder = Quotient * Den
2575   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2576 
2577   // Remainder = Num - Num_S_Remainder
2578   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2579 
2580   // Remainder_GE_Den = Remainder >= Den
2581   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2582 
2583   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2584   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2585                                        Num, Num_S_Remainder);
2586 
2587   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2588   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2589 
2590   // Calculate Division result:
2591 
2592   // Quotient_A_One = Quotient + 1
2593   auto One = B.buildConstant(S32, 1);
2594   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2595 
2596   // Quotient_S_One = Quotient - 1
2597   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2598 
2599   // Div = (Tmp1 ? Quotient_A_One : Quotient)
2600   auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2601 
2602   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2603   if (IsRem) {
2604     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2605 
2606     // Calculate Rem result:
2607     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2608 
2609     // Remainder_A_Den = Remainder + Den
2610     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2611 
2612     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2613     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2614 
2615     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2616     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2617   } else {
2618     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2619   }
2620 }
2621 
2622 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2623                                               MachineRegisterInfo &MRI,
2624                                               MachineIRBuilder &B) const {
2625   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2626   Register DstReg = MI.getOperand(0).getReg();
2627   Register Num = MI.getOperand(1).getReg();
2628   Register Den = MI.getOperand(2).getReg();
2629   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2630   MI.eraseFromParent();
2631   return true;
2632 }
2633 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2635 //
2636 // Return lo, hi of result
2637 //
2638 // %cvt.lo = G_UITOFP Val.lo
2639 // %cvt.hi = G_UITOFP Val.hi
2640 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2641 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2642 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2643 // %mul2 = G_FMUL %mul1, 2**(-32)
2644 // %trunc = G_INTRINSIC_TRUNC %mul2
2645 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2646 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2647 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2648                                                        Register Val) {
2649   const LLT S32 = LLT::scalar(32);
2650   auto Unmerge = B.buildUnmerge(S32, Val);
2651 
2652   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2653   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2654 
2655   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2656                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2657 
2658   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2659   auto Mul1 =
2660       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2661 
2662   // 2**(-32)
2663   auto Mul2 =
2664       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2665   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2666 
2667   // -(2**32)
2668   auto Mad2 = B.buildFMAD(S32, Trunc,
2669                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2670 
2671   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2672   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2673 
2674   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2675 }
2676 
2677 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2678                                               MachineRegisterInfo &MRI,
2679                                               MachineIRBuilder &B) const {
2680   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2681   const LLT S32 = LLT::scalar(32);
2682   const LLT S64 = LLT::scalar(64);
2683   const LLT S1 = LLT::scalar(1);
2684   Register Numer = MI.getOperand(1).getReg();
2685   Register Denom = MI.getOperand(2).getReg();
2686   Register RcpLo, RcpHi;
2687 
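  // Start from the reciprocal estimate produced by emitReciprocalU64, apply
  // two refinement steps, then apply up to two final corrections (adding one
  // to the quotient, or subtracting the denominator from the remainder).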
2688   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2689 
2690   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2691 
2692   auto Zero64 = B.buildConstant(S64, 0);
2693   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2694 
2695   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2696   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2697 
2698   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2699   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2700   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2701 
2702   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2703   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2704   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2705   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2706 
2707   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2708   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2709   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2710   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2711   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2712 
2713   auto Zero32 = B.buildConstant(S32, 0);
2714   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2715   auto Add2_HiC =
2716       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2717   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2718   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2719 
2720   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2721   Register NumerLo = UnmergeNumer.getReg(0);
2722   Register NumerHi = UnmergeNumer.getReg(1);
2723 
2724   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2725   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2726   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2727   Register Mul3_Lo = UnmergeMul3.getReg(0);
2728   Register Mul3_Hi = UnmergeMul3.getReg(1);
2729   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2730   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2731   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2732   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2733 
2734   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2735   Register DenomLo = UnmergeDenom.getReg(0);
2736   Register DenomHi = UnmergeDenom.getReg(1);
2737 
2738   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2739   auto C1 = B.buildSExt(S32, CmpHi);
2740 
2741   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2742   auto C2 = B.buildSExt(S32, CmpLo);
2743 
2744   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2745   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2746 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // control flow. Currently the flow is unconditional, and we use 4 selects
  // after the would-be endif to stand in for PHIs.
2750 
2751   // if C3 != 0 ...
2752   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2753   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2754   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2755   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2756 
2757   auto One64 = B.buildConstant(S64, 1);
2758   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2759 
2760   auto C4 =
2761       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2762   auto C5 =
2763       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2764   auto C6 = B.buildSelect(
2765       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2766 
2767   // if (C6 != 0)
2768   auto Add4 = B.buildAdd(S64, Add3, One64);
2769   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2770 
2771   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2772   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2773   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2774 
2775   // endif C6
2776   // endif C3
2777 
2778   if (IsDiv) {
2779     auto Sel1 = B.buildSelect(
2780         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2781     B.buildSelect(MI.getOperand(0),
2782                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2783   } else {
2784     auto Sel2 = B.buildSelect(
2785         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2786     B.buildSelect(MI.getOperand(0),
2787                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2788   }
2789 
2790   MI.eraseFromParent();
2791   return true;
2792 }
2793 
2794 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2795                                             MachineRegisterInfo &MRI,
2796                                             MachineIRBuilder &B) const {
2797   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2798   if (Ty == LLT::scalar(32))
2799     return legalizeUDIV_UREM32(MI, MRI, B);
2800   if (Ty == LLT::scalar(64))
2801     return legalizeUDIV_UREM64(MI, MRI, B);
2802   return false;
2803 }
2804 
2805 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2806                                               MachineRegisterInfo &MRI,
2807                                               MachineIRBuilder &B) const {
2808   const LLT S32 = LLT::scalar(32);
2809 
2810   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2811   Register DstReg = MI.getOperand(0).getReg();
2812   Register LHS = MI.getOperand(1).getReg();
2813   Register RHS = MI.getOperand(2).getReg();
2814 
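  // Take absolute values ((x + sign) ^ sign, with sign = x >> 31), do the
  // unsigned division, then restore signs: the quotient is negated if the
  // operand signs differ, and the remainder takes the sign of the dividend.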
2815   auto ThirtyOne = B.buildConstant(S32, 31);
2816   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2817   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2818 
2819   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2820   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2821 
2822   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2823   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2824 
2825   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2826   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2827 
2828   if (IsRem) {
2829     auto RSign = LHSign; // Remainder sign is the same as LHS
2830     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2831     B.buildSub(DstReg, UDivRem, RSign);
2832   } else {
2833     auto DSign = B.buildXor(S32, LHSign, RHSign);
2834     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2835     B.buildSub(DstReg, UDivRem, DSign);
2836   }
2837 
2838   MI.eraseFromParent();
2839   return true;
2840 }
2841 
2842 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2843                                             MachineRegisterInfo &MRI,
2844                                             MachineIRBuilder &B) const {
2845   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2846     return legalizeSDIV_SREM32(MI, MRI, B);
2847   return false;
2848 }
2849 
2850 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2851                                                  MachineRegisterInfo &MRI,
2852                                                  MachineIRBuilder &B) const {
2853   Register Res = MI.getOperand(0).getReg();
2854   Register LHS = MI.getOperand(1).getReg();
2855   Register RHS = MI.getOperand(2).getReg();
2856 
2857   uint16_t Flags = MI.getFlags();
2858 
2859   LLT ResTy = MRI.getType(Res);
2860   LLT S32 = LLT::scalar(32);
2861   LLT S64 = LLT::scalar(64);
2862 
2863   const MachineFunction &MF = B.getMF();
2864   bool Unsafe =
2865     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2866 
2867   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2868     return false;
2869 
2870   if (!Unsafe && ResTy == S32 &&
2871       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2872     return false;
2873 
2874   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2875     // 1 / x -> RCP(x)
2876     if (CLHS->isExactlyValue(1.0)) {
2877       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2878         .addUse(RHS)
2879         .setMIFlags(Flags);
2880 
2881       MI.eraseFromParent();
2882       return true;
2883     }
2884 
2885     // -1 / x -> RCP( FNEG(x) )
2886     if (CLHS->isExactlyValue(-1.0)) {
2887       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2888       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2889         .addUse(FNeg.getReg(0))
2890         .setMIFlags(Flags);
2891 
2892       MI.eraseFromParent();
2893       return true;
2894     }
2895   }
2896 
2897   // x / y -> x * (1.0 / y)
2898   if (Unsafe) {
2899     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2900       .addUse(RHS)
2901       .setMIFlags(Flags);
2902     B.buildFMul(Res, LHS, RCP, Flags);
2903 
2904     MI.eraseFromParent();
2905     return true;
2906   }
2907 
2908   return false;
2909 }
2910 
2911 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2912                                          MachineRegisterInfo &MRI,
2913                                          MachineIRBuilder &B) const {
2914   Register Res = MI.getOperand(0).getReg();
2915   Register LHS = MI.getOperand(1).getReg();
2916   Register RHS = MI.getOperand(2).getReg();
2917 
2918   uint16_t Flags = MI.getFlags();
2919 
2920   LLT S16 = LLT::scalar(16);
2921   LLT S32 = LLT::scalar(32);
2922 
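  // Lower the f16 division via f32: extend both operands, multiply the
  // numerator by an f32 reciprocal estimate of the denominator, truncate the
  // quotient back to f16, and run it through amdgcn_div_fixup.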
2923   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2924   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2925 
2926   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2927     .addUse(RHSExt.getReg(0))
2928     .setMIFlags(Flags);
2929 
2930   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2931   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2932 
2933   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2934     .addUse(RDst.getReg(0))
2935     .addUse(RHS)
2936     .addUse(LHS)
2937     .setMIFlags(Flags);
2938 
2939   MI.eraseFromParent();
2940   return true;
2941 }
2942 
2943 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2944 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2945 static void toggleSPDenormMode(bool Enable,
2946                                MachineIRBuilder &B,
2947                                const GCNSubtarget &ST,
2948                                AMDGPU::SIModeRegisterDefaults Mode) {
2949   // Set SP denorm mode to this value.
2950   unsigned SPDenormMode =
2951     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2952 
2953   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2955     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2956 
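    // The S_DENORM_MODE immediate packs both fields: bits [1:0] hold the FP32
    // denorm mode and bits [3:2] the FP64/FP16 denorm mode.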
2957     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2958     B.buildInstr(AMDGPU::S_DENORM_MODE)
2959       .addImm(NewDenormModeValue);
2960 
2961   } else {
    // Select the FP32 denorm mode bit field in the MODE register
    // (offset 4, width 2).
2963     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2964                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2965                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2966 
2967     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2968       .addImm(SPDenormMode)
2969       .addImm(SPDenormModeBitField);
2970   }
2971 }
2972 
2973 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2974                                          MachineRegisterInfo &MRI,
2975                                          MachineIRBuilder &B) const {
2976   Register Res = MI.getOperand(0).getReg();
2977   Register LHS = MI.getOperand(1).getReg();
2978   Register RHS = MI.getOperand(2).getReg();
2979   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2980   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2981 
2982   uint16_t Flags = MI.getFlags();
2983 
2984   LLT S32 = LLT::scalar(32);
2985   LLT S1 = LLT::scalar(1);
2986 
2987   auto One = B.buildFConstant(S32, 1.0f);
2988 
2989   auto DenominatorScaled =
2990     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2991       .addUse(LHS)
2992       .addUse(RHS)
2993       .addImm(0)
2994       .setMIFlags(Flags);
2995   auto NumeratorScaled =
2996     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2997       .addUse(LHS)
2998       .addUse(RHS)
2999       .addImm(1)
3000       .setMIFlags(Flags);
3001 
3002   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3003     .addUse(DenominatorScaled.getReg(0))
3004     .setMIFlags(Flags);
3005   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3006 
3007   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3008   // aren't modeled as reading it.
3009   if (!Mode.allFP32Denormals())
3010     toggleSPDenormMode(true, B, ST, Mode);
3011 
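  // One Newton-Raphson iteration on the reciprocal, followed by the quotient
  // estimate and the correction terms consumed by div_fmas:
  //   Fma0 = 1 - DenScaled * ApproxRcp    (reciprocal error)
  //   Fma1 = ApproxRcp + Fma0 * ApproxRcp (refined reciprocal)
  //   Mul  = NumScaled * Fma1             (quotient estimate)
  //   Fma2 = NumScaled - DenScaled * Mul  (residual)
  //   Fma3 = Mul + Fma2 * Fma1            (refined quotient)
  //   Fma4 = NumScaled - DenScaled * Fma3 (final residual)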
3012   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3013   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3014   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3015   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3016   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3017   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3018 
3019   if (!Mode.allFP32Denormals())
3020     toggleSPDenormMode(false, B, ST, Mode);
3021 
3022   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3023     .addUse(Fma4.getReg(0))
3024     .addUse(Fma1.getReg(0))
3025     .addUse(Fma3.getReg(0))
3026     .addUse(NumeratorScaled.getReg(1))
3027     .setMIFlags(Flags);
3028 
3029   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3030     .addUse(Fmas.getReg(0))
3031     .addUse(RHS)
3032     .addUse(LHS)
3033     .setMIFlags(Flags);
3034 
3035   MI.eraseFromParent();
3036   return true;
3037 }
3038 
3039 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3040                                          MachineRegisterInfo &MRI,
3041                                          MachineIRBuilder &B) const {
3042   Register Res = MI.getOperand(0).getReg();
3043   Register LHS = MI.getOperand(1).getReg();
3044   Register RHS = MI.getOperand(2).getReg();
3045 
3046   uint16_t Flags = MI.getFlags();
3047 
3048   LLT S64 = LLT::scalar(64);
3049   LLT S1 = LLT::scalar(1);
3050 
3051   auto One = B.buildFConstant(S64, 1.0);
3052 
3053   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3054     .addUse(LHS)
3055     .addUse(RHS)
3056     .addImm(0)
3057     .setMIFlags(Flags);
3058 
3059   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3060 
3061   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3062     .addUse(DivScale0.getReg(0))
3063     .setMIFlags(Flags);
3064 
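  // Two refinement steps on the reciprocal estimate, then a quotient estimate
  // and residual for div_fmas; same scheme as the f32 expansion above.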
3065   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3066   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3067   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3068 
3069   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3070     .addUse(LHS)
3071     .addUse(RHS)
3072     .addImm(1)
3073     .setMIFlags(Flags);
3074 
3075   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3076   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3077   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3078 
3079   Register Scale;
3080   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3083 
3084     LLT S32 = LLT::scalar(32);
3085 
3086     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3087     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3088     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3089     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3090 
3091     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3092                               Scale1Unmerge.getReg(1));
3093     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3094                               Scale0Unmerge.getReg(1));
3095     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3096   } else {
3097     Scale = DivScale1.getReg(1);
3098   }
3099 
3100   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3101     .addUse(Fma4.getReg(0))
3102     .addUse(Fma3.getReg(0))
3103     .addUse(Mul.getReg(0))
3104     .addUse(Scale)
3105     .setMIFlags(Flags);
3106 
3107   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3108     .addUse(Fmas.getReg(0))
3109     .addUse(RHS)
3110     .addUse(LHS)
3111     .setMIFlags(Flags);
3112 
3113   MI.eraseFromParent();
3114   return true;
3115 }
3116 
3117 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3118                                                  MachineRegisterInfo &MRI,
3119                                                  MachineIRBuilder &B) const {
3120   Register Res = MI.getOperand(0).getReg();
3121   Register LHS = MI.getOperand(2).getReg();
3122   Register RHS = MI.getOperand(3).getReg();
3123   uint16_t Flags = MI.getFlags();
3124 
3125   LLT S32 = LLT::scalar(32);
3126   LLT S1 = LLT::scalar(1);
3127 
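  // If |RHS| is larger than 2^96 (0x6f800000), pre-scale the denominator by
  // 2^-32 (0x2f800000) before taking its reciprocal so the reciprocal stays
  // out of the denormal range, then apply the same scale to the quotient.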
3128   auto Abs = B.buildFAbs(S32, RHS, Flags);
3129   const APFloat C0Val(1.0f);
3130 
3131   auto C0 = B.buildConstant(S32, 0x6f800000);
3132   auto C1 = B.buildConstant(S32, 0x2f800000);
3133   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3134 
3135   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3136   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3137 
3138   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3139 
3140   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3141     .addUse(Mul0.getReg(0))
3142     .setMIFlags(Flags);
3143 
3144   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3145 
3146   B.buildFMul(Res, Sel, Mul1, Flags);
3147 
3148   MI.eraseFromParent();
3149   return true;
3150 }
3151 
3152 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3153                                                  MachineRegisterInfo &MRI,
3154                                                  MachineIRBuilder &B) const {
3155   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3156   if (!MFI->isEntryFunction()) {
3157     return legalizePreloadedArgIntrin(MI, MRI, B,
3158                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3159   }
3160 
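  // In an entry function the implicit arguments live at a fixed offset past
  // the explicit kernel arguments, so compute them from the kernarg segment
  // pointer plus that offset.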
3161   uint64_t Offset =
3162     ST.getTargetLowering()->getImplicitParameterOffset(
3163       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3164   Register DstReg = MI.getOperand(0).getReg();
3165   LLT DstTy = MRI.getType(DstReg);
3166   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3167 
3168   const ArgDescriptor *Arg;
3169   const TargetRegisterClass *RC;
3170   std::tie(Arg, RC)
3171     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3172   if (!Arg)
3173     return false;
3174 
3175   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3176   if (!loadInputValue(KernargPtrReg, B, Arg))
3177     return false;
3178 
3179   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3180   MI.eraseFromParent();
3181   return true;
3182 }
3183 
3184 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3185                                               MachineRegisterInfo &MRI,
3186                                               MachineIRBuilder &B,
3187                                               unsigned AddrSpace) const {
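  // A flat pointer is in the queried segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.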
3188   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3189   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3190   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3191   MI.eraseFromParent();
3192   return true;
3193 }
3194 
3195 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3196 // offset (the offset that is included in bounds checking and swizzling, to be
3197 // split between the instruction's voffset and immoffset fields) and soffset
3198 // (the offset that is excluded from bounds checking and swizzling, to go in
3199 // the instruction's soffset field).  This function takes the first kind of
3200 // offset and figures out how to split it between voffset and immoffset.
3201 std::tuple<Register, unsigned, unsigned>
3202 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3203                                         Register OrigOffset) const {
3204   const unsigned MaxImm = 4095;
3205   Register BaseReg;
3206   unsigned TotalConstOffset;
3207   MachineInstr *OffsetDef;
3208   const LLT S32 = LLT::scalar(32);
3209 
3210   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3211     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3212 
3213   unsigned ImmOffset = TotalConstOffset;
3214 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field and fold the remaining multiple of 4096
  // into the value that is copied/added for the voffset field. A voffset that
  // is a multiple of 4096 stands more chance of being CSEd with the copy/add
  // for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
3222   unsigned Overflow = ImmOffset & ~MaxImm;
3223   ImmOffset -= Overflow;
3224   if ((int32_t)Overflow < 0) {
3225     Overflow += ImmOffset;
3226     ImmOffset = 0;
3227   }
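  // For example, a constant offset of 4100 is split into ImmOffset = 4, with
  // the remaining 4096 added into the base register below.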
3228 
3229   if (Overflow != 0) {
3230     if (!BaseReg) {
3231       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3232     } else {
3233       auto OverflowVal = B.buildConstant(S32, Overflow);
3234       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3235     }
3236   }
3237 
3238   if (!BaseReg)
3239     BaseReg = B.buildConstant(S32, 0).getReg(0);
3240 
3241   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3242 }
3243 
3244 /// Handle register layout difference for f16 images for some subtargets.
3245 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3246                                              MachineRegisterInfo &MRI,
3247                                              Register Reg) const {
3248   if (!ST.hasUnpackedD16VMem())
3249     return Reg;
3250 
3251   const LLT S16 = LLT::scalar(16);
3252   const LLT S32 = LLT::scalar(32);
3253   LLT StoreVT = MRI.getType(Reg);
3254   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3255 
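  // On unpacked subtargets every 16-bit element occupies its own 32-bit
  // register, so split the value into s16 pieces and any-extend each to s32.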
3256   auto Unmerge = B.buildUnmerge(S16, Reg);
3257 
3258   SmallVector<Register, 4> WideRegs;
3259   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3260     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3261 
3262   int NumElts = StoreVT.getNumElements();
3263 
3264   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3265 }
3266 
3267 Register AMDGPULegalizerInfo::fixStoreSourceType(
3268   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3269   MachineRegisterInfo *MRI = B.getMRI();
3270   LLT Ty = MRI->getType(VData);
3271 
3272   const LLT S16 = LLT::scalar(16);
3273 
3274   // Fixup illegal register types for i8 stores.
3275   if (Ty == LLT::scalar(8) || Ty == S16) {
3276     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3277     return AnyExt;
3278   }
3279 
3280   if (Ty.isVector()) {
3281     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3282       if (IsFormat)
3283         return handleD16VData(B, *MRI, VData);
3284     }
3285   }
3286 
3287   return VData;
3288 }
3289 
3290 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3291                                               MachineRegisterInfo &MRI,
3292                                               MachineIRBuilder &B,
3293                                               bool IsTyped,
3294                                               bool IsFormat) const {
3295   Register VData = MI.getOperand(1).getReg();
3296   LLT Ty = MRI.getType(VData);
3297   LLT EltTy = Ty.getScalarType();
3298   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3299   const LLT S32 = LLT::scalar(32);
3300 
3301   VData = fixStoreSourceType(B, VData, IsFormat);
3302   Register RSrc = MI.getOperand(2).getReg();
3303 
3304   MachineMemOperand *MMO = *MI.memoperands_begin();
3305   const int MemSize = MMO->getSize();
3306 
3307   unsigned ImmOffset;
3308   unsigned TotalOffset;
3309 
3310   // The typed intrinsics add an immediate after the registers.
3311   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3312 
3313   // The struct intrinsic variants add one additional operand over raw.
3314   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3315   Register VIndex;
3316   int OpOffset = 0;
3317   if (HasVIndex) {
3318     VIndex = MI.getOperand(3).getReg();
3319     OpOffset = 1;
3320   }
3321 
3322   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3323   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3324 
3325   unsigned Format = 0;
3326   if (IsTyped) {
3327     Format = MI.getOperand(5 + OpOffset).getImm();
3328     ++OpOffset;
3329   }
3330 
3331   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3332 
3333   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3334   if (TotalOffset != 0)
3335     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3336 
3337   unsigned Opc;
3338   if (IsTyped) {
3339     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3340                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3341   } else if (IsFormat) {
3342     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3343                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3344   } else {
3345     switch (MemSize) {
3346     case 1:
3347       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3348       break;
3349     case 2:
3350       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3351       break;
3352     default:
3353       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3354       break;
3355     }
3356   }
3357 
3358   if (!VIndex)
3359     VIndex = B.buildConstant(S32, 0).getReg(0);
3360 
3361   auto MIB = B.buildInstr(Opc)
3362     .addUse(VData)              // vdata
3363     .addUse(RSrc)               // rsrc
3364     .addUse(VIndex)             // vindex
3365     .addUse(VOffset)            // voffset
3366     .addUse(SOffset)            // soffset
3367     .addImm(ImmOffset);         // offset(imm)
3368 
3369   if (IsTyped)
3370     MIB.addImm(Format);
3371 
3372   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3373      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3374      .addMemOperand(MMO);
3375 
3376   MI.eraseFromParent();
3377   return true;
3378 }
3379 
3380 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3381                                              MachineRegisterInfo &MRI,
3382                                              MachineIRBuilder &B,
3383                                              bool IsFormat,
3384                                              bool IsTyped) const {
3385   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3386   MachineMemOperand *MMO = *MI.memoperands_begin();
3387   const int MemSize = MMO->getSize();
3388   const LLT S32 = LLT::scalar(32);
3389 
3390   Register Dst = MI.getOperand(0).getReg();
3391   Register RSrc = MI.getOperand(2).getReg();
3392 
3393   // The typed intrinsics add an immediate after the registers.
3394   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3395 
3396   // The struct intrinsic variants add one additional operand over raw.
3397   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3398   Register VIndex;
3399   int OpOffset = 0;
3400   if (HasVIndex) {
3401     VIndex = MI.getOperand(3).getReg();
3402     OpOffset = 1;
3403   }
3404 
3405   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3406   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3407 
3408   unsigned Format = 0;
3409   if (IsTyped) {
3410     Format = MI.getOperand(5 + OpOffset).getImm();
3411     ++OpOffset;
3412   }
3413 
3414   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3415   unsigned ImmOffset;
3416   unsigned TotalOffset;
3417 
3418   LLT Ty = MRI.getType(Dst);
3419   LLT EltTy = Ty.getScalarType();
3420   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3421   const bool Unpacked = ST.hasUnpackedD16VMem();
3422 
3423   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3424   if (TotalOffset != 0)
3425     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3426 
3427   unsigned Opc;
3428 
3429   if (IsTyped) {
3430     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3431                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3432   } else if (IsFormat) {
3433     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3434                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3435   } else {
3436     switch (MemSize) {
3437     case 1:
3438       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3439       break;
3440     case 2:
3441       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3442       break;
3443     default:
3444       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3445       break;
3446     }
3447   }
3448 
3449   Register LoadDstReg;
3450 
3451   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3452   LLT UnpackedTy = Ty.changeElementSize(32);
3453 
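  // Sub-dword loads return their result in a full 32-bit register and need a
  // trunc afterwards; unpacked d16 vector results need to be repacked instead.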
3454   if (IsExtLoad)
3455     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3456   else if (Unpacked && IsD16 && Ty.isVector())
3457     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3458   else
3459     LoadDstReg = Dst;
3460 
3461   if (!VIndex)
3462     VIndex = B.buildConstant(S32, 0).getReg(0);
3463 
3464   auto MIB = B.buildInstr(Opc)
3465     .addDef(LoadDstReg)         // vdata
3466     .addUse(RSrc)               // rsrc
3467     .addUse(VIndex)             // vindex
3468     .addUse(VOffset)            // voffset
3469     .addUse(SOffset)            // soffset
3470     .addImm(ImmOffset);         // offset(imm)
3471 
3472   if (IsTyped)
3473     MIB.addImm(Format);
3474 
3475   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3476      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3477      .addMemOperand(MMO);
3478 
3479   if (LoadDstReg != Dst) {
3480     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3481 
    // The result for an extending load was widened to 32 bits; truncate it
    // back to the original type.
3483     if (IsExtLoad)
3484       B.buildTrunc(Dst, LoadDstReg);
3485     else {
3486       // Repack to original 16-bit vector result
3487       // FIXME: G_TRUNC should work, but legalization currently fails
3488       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3489       SmallVector<Register, 4> Repack;
3490       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3491         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3492       B.buildMerge(Dst, Repack);
3493     }
3494   }
3495 
3496   MI.eraseFromParent();
3497   return true;
3498 }
3499 
3500 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3501                                                MachineIRBuilder &B,
3502                                                bool IsInc) const {
3503   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3504                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3505   B.buildInstr(Opc)
3506     .addDef(MI.getOperand(0).getReg())
3507     .addUse(MI.getOperand(2).getReg())
3508     .addUse(MI.getOperand(3).getReg())
3509     .cloneMemRefs(MI);
3510   MI.eraseFromParent();
3511   return true;
3512 }
3513 
3514 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3515   switch (IntrID) {
3516   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3517   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3518     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3519   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3520   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3521     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3522   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3523   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3524     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3525   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3526   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3527     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3528   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3529   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3530     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3531   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3532   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3533     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3534   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3535   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3536     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3537   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3538   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3540   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3541   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3542     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3543   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3544   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3546   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3547   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3548     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3549   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3550   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3551     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3552   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3553   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3555   default:
3556     llvm_unreachable("unhandled atomic opcode");
3557   }
3558 }
3559 
3560 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3561                                                MachineIRBuilder &B,
3562                                                Intrinsic::ID IID) const {
3563   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3564                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3565 
3566   Register Dst = MI.getOperand(0).getReg();
3567   Register VData = MI.getOperand(2).getReg();
3568 
3569   Register CmpVal;
3570   int OpOffset = 0;
3571 
3572   if (IsCmpSwap) {
3573     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3574     ++OpOffset;
3575   }
3576 
3577   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3578   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3579 
3580   // The struct intrinsic variants add one additional operand over raw.
3581   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3582   Register VIndex;
3583   if (HasVIndex) {
3584     VIndex = MI.getOperand(4 + OpOffset).getReg();
3585     ++OpOffset;
3586   }
3587 
3588   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3589   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3590   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3591 
3592   MachineMemOperand *MMO = *MI.memoperands_begin();
3593 
3594   unsigned ImmOffset;
3595   unsigned TotalOffset;
3596   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3597   if (TotalOffset != 0)
3598     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3599 
3600   if (!VIndex)
3601     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3602 
3603   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3604     .addDef(Dst)
3605     .addUse(VData); // vdata
3606 
3607   if (IsCmpSwap)
3608     MIB.addReg(CmpVal);
3609 
3610   MIB.addUse(RSrc)               // rsrc
3611      .addUse(VIndex)             // vindex
3612      .addUse(VOffset)            // voffset
3613      .addUse(SOffset)            // soffset
3614      .addImm(ImmOffset)          // offset(imm)
3615      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3616      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3617      .addMemOperand(MMO);
3618 
3619   MI.eraseFromParent();
3620   return true;
3621 }
3622 
3623 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3624 /// vector with s16 typed elements.
3625 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3626                                         SmallVectorImpl<Register> &PackedAddrs,
3627                                         int AddrIdx, int DimIdx, int EndIdx,
3628                                         int NumGradients) {
3629   const LLT S16 = LLT::scalar(16);
3630   const LLT V2S16 = LLT::vector(2, 16);
3631 
3632   for (int I = AddrIdx; I < EndIdx; ++I) {
3633     MachineOperand &SrcOp = MI.getOperand(I);
3634     if (!SrcOp.isReg())
3635       continue; // _L to _LZ may have eliminated this.
3636 
3637     Register AddrReg = SrcOp.getReg();
3638 
3639     if (I < DimIdx) {
3640       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3641       PackedAddrs.push_back(AddrReg);
3642     } else {
3643       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3644       // derivatives dx/dh and dx/dv are packed with undef.
3645       if (((I + 1) >= EndIdx) ||
3646           ((NumGradients / 2) % 2 == 1 &&
3647            (I == DimIdx + (NumGradients / 2) - 1 ||
3648             I == DimIdx + NumGradients - 1)) ||
3649           // Check for _L to _LZ optimization
3650           !MI.getOperand(I + 1).isReg()) {
3651         PackedAddrs.push_back(
3652             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3653                 .getReg(0));
3654       } else {
3655         PackedAddrs.push_back(
3656             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3657                 .getReg(0));
3658         ++I;
3659       }
3660     }
3661   }
3662 }
3663 
3664 /// Convert from separate vaddr components to a single vector address register,
3665 /// and replace the remaining operands with $noreg.
3666 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3667                                      int DimIdx, int NumVAddrs) {
3668   const LLT S32 = LLT::scalar(32);
3669 
3670   SmallVector<Register, 8> AddrRegs;
3671   for (int I = 0; I != NumVAddrs; ++I) {
3672     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3673     if (SrcOp.isReg()) {
3674       AddrRegs.push_back(SrcOp.getReg());
3675       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3676     }
3677   }
3678 
3679   int NumAddrRegs = AddrRegs.size();
3680   if (NumAddrRegs != 1) {
3681     // Round up to 8 elements for v5-v7
3682     // FIXME: Missing intermediate sized register classes and instructions.
3683     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3684       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3685       auto Undef = B.buildUndef(S32);
3686       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3687       NumAddrRegs = RoundedNumRegs;
3688     }
3689 
3690     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3691     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3692   }
3693 
3694   for (int I = 1; I != NumVAddrs; ++I) {
3695     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3696     if (SrcOp.isReg())
3697       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3698   }
3699 }
3700 
3701 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3702 ///
3703 /// Depending on the subtarget, load/store with 16-bit element data need to be
3704 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3705 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3706 /// registers.
3707 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
3714 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3715     MachineInstr &MI, MachineIRBuilder &B,
3716     GISelChangeObserver &Observer,
3717     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3718 
3719   const int NumDefs = MI.getNumExplicitDefs();
3720   bool IsTFE = NumDefs == 2;
3721   // We are only processing the operands of d16 image operations on subtargets
3722   // that use the unpacked register layout, or need to repack the TFE result.
3723 
3724   // TODO: Do we need to guard against already legalized intrinsics?
3725   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3726     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3727 
3728   MachineRegisterInfo *MRI = B.getMRI();
3729   const LLT S32 = LLT::scalar(32);
3730   const LLT S16 = LLT::scalar(16);
3731   const LLT V2S16 = LLT::vector(2, 16);
3732 
3733   // Index of first address argument
3734   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3735 
3736   int NumVAddrs, NumGradients;
3737   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3738   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3739     getDMaskIdx(BaseOpcode, NumDefs);
3740   unsigned DMask = 0;
3741 
3742   // Check for 16 bit addresses and pack if true.
3743   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3744   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3745   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3746   const bool IsG16 = GradTy == S16;
3747   const bool IsA16 = AddrTy == S16;
3748 
3749   int DMaskLanes = 0;
3750   if (!BaseOpcode->Atomic) {
3751     DMask = MI.getOperand(DMaskIdx).getImm();
3752     if (BaseOpcode->Gather4) {
3753       DMaskLanes = 4;
3754     } else if (DMask != 0) {
3755       DMaskLanes = countPopulation(DMask);
3756     } else if (!IsTFE && !BaseOpcode->Store) {
3757       // If dmask is 0, this is a no-op load. This can be eliminated.
3758       B.buildUndef(MI.getOperand(0));
3759       MI.eraseFromParent();
3760       return true;
3761     }
3762   }
3763 
3764   Observer.changingInstr(MI);
3765   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3766 
3767   unsigned NewOpcode = NumDefs == 0 ?
3768     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3769 
3770   // Track that we legalized this
3771   MI.setDesc(B.getTII().get(NewOpcode));
3772 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3775   if (IsTFE && DMask == 0) {
3776     DMask = 0x1;
3777     DMaskLanes = 1;
3778     MI.getOperand(DMaskIdx).setImm(DMask);
3779   }
3780 
3781   if (BaseOpcode->Atomic) {
3782     Register VData0 = MI.getOperand(2).getReg();
3783     LLT Ty = MRI->getType(VData0);
3784 
3785     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3786     if (Ty.isVector())
3787       return false;
3788 
3789     if (BaseOpcode->AtomicX2) {
3790       Register VData1 = MI.getOperand(3).getReg();
3791       // The two values are packed in one register.
3792       LLT PackedTy = LLT::vector(2, Ty);
3793       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3794       MI.getOperand(2).setReg(Concat.getReg(0));
3795       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3796     }
3797   }
3798 
3799   int CorrectedNumVAddrs = NumVAddrs;
3800 
3801   // Optimize _L to _LZ when _L is zero
3802   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3803         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3804     const ConstantFP *ConstantLod;
3805     const int LodIdx = AddrIdx + NumVAddrs - 1;
3806 
3807     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3808       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3809         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3810         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3811           LZMappingInfo->LZ, ImageDimIntr->Dim);
3812 
3813         // The starting indexes should remain in the same place.
3814         --NumVAddrs;
3815         --CorrectedNumVAddrs;
3816 
3817         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3818           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3819         MI.RemoveOperand(LodIdx);
3820       }
3821     }
3822   }
3823 
3824   // Optimize _mip away, when 'lod' is zero
3825   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3826     int64_t ConstantLod;
3827     const int LodIdx = AddrIdx + NumVAddrs - 1;
3828 
3829     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3830       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3833         MI.getOperand(LodIdx).ChangeToImmediate(0);
3834         --CorrectedNumVAddrs;
3835       }
3836     }
3837   }
3838 
3839   // Rewrite the addressing register layout before doing anything else.
3840   if (IsA16 || IsG16) {
3841     if (IsA16) {
3842       // Target must support the feature and gradients need to be 16 bit too
3843       if (!ST.hasA16() || !IsG16)
3844         return false;
3845     } else if (!ST.hasG16())
3846       return false;
3847 
3848     if (NumVAddrs > 1) {
3849       SmallVector<Register, 4> PackedRegs;
3850       // Don't compress addresses for G16
3851       const int PackEndIdx =
3852           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3853       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3854                                   PackEndIdx, NumGradients);
3855 
3856       if (!IsA16) {
3857         // Add uncompressed address
3858         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3859           int AddrReg = MI.getOperand(I).getReg();
3860           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3861           PackedRegs.push_back(AddrReg);
3862         }
3863       }
3864 
3865       // See also below in the non-a16 branch
3866       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3867 
3868       if (!UseNSA && PackedRegs.size() > 1) {
3869         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3870         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3871         PackedRegs[0] = Concat.getReg(0);
3872         PackedRegs.resize(1);
3873       }
3874 
3875       const int NumPacked = PackedRegs.size();
3876       for (int I = 0; I != NumVAddrs; ++I) {
3877         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3878         if (!SrcOp.isReg()) {
3879           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3880           continue;
3881         }
3882 
3883         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3884 
3885         if (I < NumPacked)
3886           SrcOp.setReg(PackedRegs[I]);
3887         else
3888           SrcOp.setReg(AMDGPU::NoRegister);
3889       }
3890     }
3891   } else {
3892     // If the register allocator cannot place the address registers contiguously
3893     // without introducing moves, then using the non-sequential address encoding
3894     // is always preferable, since it saves VALU instructions and is usually a
3895     // wash in terms of code size or even better.
3896     //
3897     // However, we currently have no way of hinting to the register allocator
3898     // that MIMG addresses should be placed contiguously when it is possible to
3899     // do so, so force non-NSA for the common 2-address case as a heuristic.
3900     //
3901     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3902     // allocation when possible.
3903     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3904 
3905     if (!UseNSA && NumVAddrs > 1)
3906       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3907   }
3908 
3909   int Flags = 0;
3910   if (IsA16)
3911     Flags |= 1;
3912   if (IsG16)
3913     Flags |= 2;
3914   MI.addOperand(MachineOperand::CreateImm(Flags));
3915 
3916   if (BaseOpcode->Store) { // No TFE for stores?
3917     // TODO: Handle dmask trim
3918     Register VData = MI.getOperand(1).getReg();
3919     LLT Ty = MRI->getType(VData);
3920     if (!Ty.isVector() || Ty.getElementType() != S16)
3921       return true;
3922 
3923     Register RepackedReg = handleD16VData(B, *MRI, VData);
3924     if (RepackedReg != VData) {
3925       MI.getOperand(1).setReg(RepackedReg);
3926     }
3927 
3928     return true;
3929   }
3930 
3931   Register DstReg = MI.getOperand(0).getReg();
3932   LLT Ty = MRI->getType(DstReg);
3933   const LLT EltTy = Ty.getScalarType();
3934   const bool IsD16 = Ty.getScalarType() == S16;
3935   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3936 
3937   // Confirm that the return type is large enough for the dmask specified
3938   if (NumElts < DMaskLanes)
3939     return false;
3940 
3941   if (NumElts > 4 || DMaskLanes > 4)
3942     return false;
3943 
3944   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3945   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3946 
3947   // The raw dword aligned data component of the load. The only legal cases
3948   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3950   LLT RoundedTy;
3951 
  // S32 vector to cover all data, plus the TFE result element.
3953   LLT TFETy;
3954 
3955   // Register type to use for each loaded component. Will be S32 or V2S16.
3956   LLT RegTy;
3957 
3958   if (IsD16 && ST.hasUnpackedD16VMem()) {
3959     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3960     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3961     RegTy = S32;
3962   } else {
3963     unsigned EltSize = EltTy.getSizeInBits();
3964     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3965     unsigned RoundedSize = 32 * RoundedElts;
3966     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3967     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3968     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3969   }
3970 
3971   // The return type does not need adjustment.
3972   // TODO: Should we change s16 case to s32 or <2 x s16>?
3973   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3974     return true;
3975 
3976   Register Dst1Reg;
3977 
3978   // Insert after the instruction.
3979   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3980 
3981   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3982   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3983   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3984   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3985 
3986   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3987 
3988   MI.getOperand(0).setReg(NewResultReg);
3989 
3990   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
3992   // register, with one additional dword beyond the loaded data. Rewrite the
3993   // return type to use a single register result.
3994 
3995   if (IsTFE) {
3996     Dst1Reg = MI.getOperand(1).getReg();
3997     if (MRI->getType(Dst1Reg) != S32)
3998       return false;
3999 
4000     // TODO: Make sure the TFE operand bit is set.
4001     MI.RemoveOperand(1);
4002 
4003     // Handle the easy case that requires no repack instructions.
4004     if (Ty == S32) {
4005       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4006       return true;
4007     }
4008   }
4009 
4010   // Now figure out how to copy the new result register back into the old
4011   // result.
4012   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4013 
4014   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4015 
4016   if (ResultNumRegs == 1) {
4017     assert(!IsTFE);
4018     ResultRegs[0] = NewResultReg;
4019   } else {
4020     // We have to repack into a new vector of some kind.
4021     for (int I = 0; I != NumDataRegs; ++I)
4022       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4023     B.buildUnmerge(ResultRegs, NewResultReg);
4024 
4025     // Drop the final TFE element to get the data part. The TFE result is
4026     // directly written to the right place already.
4027     if (IsTFE)
4028       ResultRegs.resize(NumDataRegs);
4029   }
4030 
4031   // For an s16 scalar result, we form an s32 result with a truncate regardless
4032   // of packed vs. unpacked.
4033   if (IsD16 && !Ty.isVector()) {
4034     B.buildTrunc(DstReg, ResultRegs[0]);
4035     return true;
4036   }
4037 
4038   // Avoid a build/concat_vector of 1 entry.
4039   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4040     B.buildBitcast(DstReg, ResultRegs[0]);
4041     return true;
4042   }
4043 
4044   assert(Ty.isVector());
4045 
4046   if (IsD16) {
4047     // For packed D16 results with TFE enabled, all the data components are
4048     // S32. Cast back to the expected type.
4049     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4052     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4053       for (Register &Reg : ResultRegs)
4054         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4055     } else if (ST.hasUnpackedD16VMem()) {
4056       for (Register &Reg : ResultRegs)
4057         Reg = B.buildTrunc(S16, Reg).getReg(0);
4058     }
4059   }
4060 
4061   auto padWithUndef = [&](LLT Ty, int NumElts) {
4062     if (NumElts == 0)
4063       return;
4064     Register Undef = B.buildUndef(Ty).getReg(0);
4065     for (int I = 0; I != NumElts; ++I)
4066       ResultRegs.push_back(Undef);
4067   };
4068 
4069   // Pad out any elements eliminated due to the dmask.
4070   LLT ResTy = MRI->getType(ResultRegs[0]);
4071   if (!ResTy.isVector()) {
4072     padWithUndef(ResTy, NumElts - ResultRegs.size());
4073     B.buildBuildVector(DstReg, ResultRegs);
4074     return true;
4075   }
4076 
4077   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4078   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4079 
4080   // Deal with the one annoying legal case.
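  // A <3 x s16> result is padded out and concatenated into <6 x s16>, then the
  // unused upper half is split back off with an unmerge.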
4081   const LLT V3S16 = LLT::vector(3, 16);
4082   if (Ty == V3S16) {
4083     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4084     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4085     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4086     return true;
4087   }
4088 
4089   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4090   B.buildConcatVectors(DstReg, ResultRegs);
4091   return true;
4092 }
4093 
4094 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4095   MachineInstr &MI, MachineIRBuilder &B,
4096   GISelChangeObserver &Observer) const {
4097   Register Dst = MI.getOperand(0).getReg();
4098   LLT Ty = B.getMRI()->getType(Dst);
4099   unsigned Size = Ty.getSizeInBits();
4100   MachineFunction &MF = B.getMF();
4101 
4102   Observer.changingInstr(MI);
4103 
4104   // FIXME: We don't really need this intermediate instruction. The intrinsic
4105   // should be fixed to have a memory operand. Since it's readnone, we're not
4106   // allowed to add one.
4107   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4108   MI.RemoveOperand(1); // Remove intrinsic ID
4109 
4110   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4111   // TODO: Should this use datalayout alignment?
4112   const unsigned MemSize = (Size + 7) / 8;
4113   const Align MemAlign(4);
4114   MachineMemOperand *MMO = MF.getMachineMemOperand(
4115       MachinePointerInfo(),
4116       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4117           MachineMemOperand::MOInvariant,
4118       MemSize, MemAlign);
4119   MI.addMemOperand(MF, MMO);
4120 
4121   // There are no 96-bit result scalar loads, but widening to 128-bit should
4122   // always be legal. We may need to restore this to a 96-bit result if it turns
4123   // out this needs to be converted to a vector load during RegBankSelect.
4124   if (!isPowerOf2_32(Size)) {
4125     LegalizerHelper Helper(MF, *this, Observer, B);
4126 
4127     if (Ty.isVector())
4128       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4129     else
4130       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4131   }
4132 
4133   Observer.changedInstr(MI);
4134   return true;
4135 }
4136 
4137 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4138                                                 MachineRegisterInfo &MRI,
4139                                                 MachineIRBuilder &B) const {
  // On a non-HSA path, or if the trap handler is disabled, insert s_endpgm.
4141   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4142       !ST.isTrapHandlerEnabled()) {
4143     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4144   } else {
4145     // Pass queue pointer to trap handler as input, and insert trap instruction
4146     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4147     const ArgDescriptor *Arg =
4148         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4149     if (!Arg)
4150       return false;
4151     MachineRegisterInfo &MRI = *B.getMRI();
4152     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4153     Register LiveIn = getLiveInRegister(
4154         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4155         /*InsertLiveInCopy=*/false);
4156     if (!loadInputValue(LiveIn, B, Arg))
4157       return false;
4158     B.buildCopy(SGPR01, LiveIn);
4159     B.buildInstr(AMDGPU::S_TRAP)
4160         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4161         .addReg(SGPR01, RegState::Implicit);
4162   }
4163 
4164   MI.eraseFromParent();
4165   return true;
4166 }
4167 
4168 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4169     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On a non-HSA path, or if the trap handler is disabled, report a warning
  // instead.
4172   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4173       !ST.isTrapHandlerEnabled()) {
4174     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4175                                      "debugtrap handler not supported",
4176                                      MI.getDebugLoc(), DS_Warning);
4177     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4178     Ctx.diagnose(NoTrap);
4179   } else {
4180     // Insert debug-trap instruction
4181     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4182   }
4183 
4184   MI.eraseFromParent();
4185   return true;
4186 }
4187 
4188 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4189                                             MachineIRBuilder &B,
4190                                             GISelChangeObserver &Observer) const {
4191   MachineRegisterInfo &MRI = *B.getMRI();
4192 
4193   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4194   auto IntrID = MI.getIntrinsicID();
4195   switch (IntrID) {
4196   case Intrinsic::amdgcn_if:
4197   case Intrinsic::amdgcn_else: {
4198     MachineInstr *Br = nullptr;
4199     MachineBasicBlock *UncondBrTarget = nullptr;
4200     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4201       const SIRegisterInfo *TRI
4202         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4203 
4204       Register Def = MI.getOperand(1).getReg();
4205       Register Use = MI.getOperand(3).getReg();
4206 
4207       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4208       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4209       if (IntrID == Intrinsic::amdgcn_if) {
4210         B.buildInstr(AMDGPU::SI_IF)
4211           .addDef(Def)
4212           .addUse(Use)
4213           .addMBB(UncondBrTarget);
4214       } else {
4215         B.buildInstr(AMDGPU::SI_ELSE)
4216           .addDef(Def)
4217           .addUse(Use)
4218           .addMBB(UncondBrTarget)
4219           .addImm(0);
4220       }
4221 
4222       if (Br) {
4223         Br->getOperand(0).setMBB(CondBrTarget);
4224       } else {
4225         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4226         // since we're swapping branch targets it needs to be reinserted.
4227         // FIXME: IRTranslator should probably not do this
4228         B.buildBr(*CondBrTarget);
4229       }
4230 
4231       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4232       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4233       MI.eraseFromParent();
4234       BrCond->eraseFromParent();
4235       return true;
4236     }
4237 
4238     return false;
4239   }
4240   case Intrinsic::amdgcn_loop: {
4241     MachineInstr *Br = nullptr;
4242     MachineBasicBlock *UncondBrTarget = nullptr;
4243     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4244       const SIRegisterInfo *TRI
4245         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4246 
4247       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4248       Register Reg = MI.getOperand(2).getReg();
4249 
4250       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4251       B.buildInstr(AMDGPU::SI_LOOP)
4252         .addUse(Reg)
4253         .addMBB(UncondBrTarget);
4254 
4255       if (Br)
4256         Br->getOperand(0).setMBB(CondBrTarget);
4257       else
4258         B.buildBr(*CondBrTarget);
4259 
4260       MI.eraseFromParent();
4261       BrCond->eraseFromParent();
4262       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4263       return true;
4264     }
4265 
4266     return false;
4267   }
4268   case Intrinsic::amdgcn_kernarg_segment_ptr:
4269     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4270       // This only makes sense to call in a kernel, so just lower to null.
4271       B.buildConstant(MI.getOperand(0).getReg(), 0);
4272       MI.eraseFromParent();
4273       return true;
4274     }
4275 
4276     return legalizePreloadedArgIntrin(
4277       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4278   case Intrinsic::amdgcn_implicitarg_ptr:
4279     return legalizeImplicitArgPtr(MI, MRI, B);
4280   case Intrinsic::amdgcn_workitem_id_x:
4281     return legalizePreloadedArgIntrin(MI, MRI, B,
4282                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4283   case Intrinsic::amdgcn_workitem_id_y:
4284     return legalizePreloadedArgIntrin(MI, MRI, B,
4285                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4286   case Intrinsic::amdgcn_workitem_id_z:
4287     return legalizePreloadedArgIntrin(MI, MRI, B,
4288                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4289   case Intrinsic::amdgcn_workgroup_id_x:
4290     return legalizePreloadedArgIntrin(MI, MRI, B,
4291                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4292   case Intrinsic::amdgcn_workgroup_id_y:
4293     return legalizePreloadedArgIntrin(MI, MRI, B,
4294                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4295   case Intrinsic::amdgcn_workgroup_id_z:
4296     return legalizePreloadedArgIntrin(MI, MRI, B,
4297                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4298   case Intrinsic::amdgcn_dispatch_ptr:
4299     return legalizePreloadedArgIntrin(MI, MRI, B,
4300                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4301   case Intrinsic::amdgcn_queue_ptr:
4302     return legalizePreloadedArgIntrin(MI, MRI, B,
4303                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4304   case Intrinsic::amdgcn_implicit_buffer_ptr:
4305     return legalizePreloadedArgIntrin(
4306       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4307   case Intrinsic::amdgcn_dispatch_id:
4308     return legalizePreloadedArgIntrin(MI, MRI, B,
4309                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4310   case Intrinsic::amdgcn_fdiv_fast:
4311     return legalizeFDIVFastIntrin(MI, MRI, B);
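  // amdgcn.is.shared / amdgcn.is.private test whether a flat pointer lies in
  // the given address space by comparing its high bits against the
  // corresponding aperture base.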
4312   case Intrinsic::amdgcn_is_shared:
4313     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4314   case Intrinsic::amdgcn_is_private:
4315     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4316   case Intrinsic::amdgcn_wavefrontsize: {
4317     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4318     MI.eraseFromParent();
4319     return true;
4320   }
4321   case Intrinsic::amdgcn_s_buffer_load:
4322     return legalizeSBufferLoad(MI, B, Observer);
4323   case Intrinsic::amdgcn_raw_buffer_store:
4324   case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/false);
4326   case Intrinsic::amdgcn_raw_buffer_store_format:
4327   case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false,
                               /*IsFormat=*/true);
4329   case Intrinsic::amdgcn_raw_tbuffer_store:
4330   case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true,
                               /*IsFormat=*/true);
4332   case Intrinsic::amdgcn_raw_buffer_load:
4333   case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false,
                              /*IsTyped=*/false);
4335   case Intrinsic::amdgcn_raw_buffer_load_format:
4336   case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/false);
4338   case Intrinsic::amdgcn_raw_tbuffer_load:
4339   case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true,
                              /*IsTyped=*/true);
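  // Every buffer atomic flavor shares a single legalization path; the
  // specific operation is derived from the intrinsic ID in
  // legalizeBufferAtomic.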
4341   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4342   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4343   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4344   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4345   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4346   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4347   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4348   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4349   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4350   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4351   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4352   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4353   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4354   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4355   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4356   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4357   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4358   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4359   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4360   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4361   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4362   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4363   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4364   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4365   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4366   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4367     return legalizeBufferAtomic(MI, B, IntrID);
4368   case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, /*IsInc=*/true);
4370   case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, /*IsInc=*/false);
4372   case Intrinsic::trap:
4373     return legalizeTrapIntrinsic(MI, MRI, B);
4374   case Intrinsic::debugtrap:
4375     return legalizeDebugTrapIntrinsic(MI, MRI, B);
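  // Image (dim) intrinsics are recognized by table lookup and get their own
  // legalization; any other intrinsic is already legal.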
4376   default: {
4377     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4378             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4379       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4380     return true;
4381   }
4382   }
4383 
4384   return true;
4385 }
4386