1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of vector elements up to the next power of two,
// e.g. <3 x s32> -> <4 x s32>.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
55 
// Round the scalar size in bits up to the next power of two,
// e.g. s24 -> s32.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
62 
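// True for vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16> (48 bits). Users of this
// predicate typically pad such types by one element (see oneMoreElement).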
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
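// Reduce the element count so each resulting piece is at most 64 bits wide,
// e.g. <4 x s32> (128 bits) becomes <2 x s32>, and <8 x s16> becomes
// <4 x s16>.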
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements until the total size is the next
// multiple of 32 bits, e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits).
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
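// Coerce a type to the scalar of the same size (for 32 bits or less), or to a
// vector of 32-bit elements, e.g. <6 x s16> (96 bits) becomes <3 x s32>.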
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
// TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
448     // Technically the saturating operations require clamp bit support, but this
449     // was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
457     // We're just lowering this, but it helps get a better result to try to
458     // coerce to the desired type first.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .customIf(typeIsNot(0, PrivatePtr));
555 
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop forever
    // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
715     .scalarize(0)
716     .lower();
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
720       .legalFor({S16, S32, S64})
721       .clampScalar(0, S16, S64)
722       .scalarize(0);
723   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
724     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
725       .legalFor({S32, S64})
726       .clampScalar(0, S32, S64)
727       .scalarize(0);
728   } else {
729     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
730       .legalFor({S32})
731       .customFor({S64})
732       .clampScalar(0, S32, S64)
733       .scalarize(0);
734   }
735 
736   getActionDefinitionsBuilder(G_PTR_ADD)
737     .legalIf(all(isPointer(0), sameSize(0, 1)))
738     .scalarize(0)
739     .scalarSameSizeAs(1, 0);
740 
741   getActionDefinitionsBuilder(G_PTRMASK)
742     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
743     .scalarSameSizeAs(1, 0)
744     .scalarize(0);
745 
746   auto &CmpBuilder =
747     getActionDefinitionsBuilder(G_ICMP)
748     // The compare output type differs based on the register bank of the output,
749     // so make both s1 and s32 legal.
750     //
751     // Scalar compares producing output in scc will be promoted to s32, as that
752     // is the allocatable register type that will be needed for the copy from
753     // scc. This will be promoted during RegBankSelect, and we assume something
754     // before that won't try to use s32 result types.
755     //
756     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
757     // bank.
758     .legalForCartesianProduct(
759       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
760     .legalForCartesianProduct(
761       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
762   if (ST.has16BitInsts()) {
763     CmpBuilder.legalFor({{S1, S16}});
764   }
765 
766   CmpBuilder
767     .widenScalarToNextPow2(1)
768     .clampScalar(1, S32, S64)
769     .scalarize(0)
770     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
771 
772   getActionDefinitionsBuilder(G_FCMP)
773     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
774     .widenScalarToNextPow2(1)
775     .clampScalar(1, S32, S64)
776     .scalarize(0);
777 
778   // FIXME: fpow has a selection pattern that should move to custom lowering.
779   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
780   if (ST.has16BitInsts())
781     Exp2Ops.legalFor({S32, S16});
782   else
783     Exp2Ops.legalFor({S32});
784   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
785   Exp2Ops.scalarize(0);
786 
787   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
788   if (ST.has16BitInsts())
789     ExpOps.customFor({{S32}, {S16}});
790   else
791     ExpOps.customFor({S32});
792   ExpOps.clampScalar(0, MinScalarFPTy, S32)
793         .scalarize(0);
794 
795   getActionDefinitionsBuilder(G_FPOWI)
796     .clampScalar(0, MinScalarFPTy, S32)
797     .lower();
798 
799   // The 64-bit versions produce 32-bit results, but only on the SALU.
800   getActionDefinitionsBuilder(G_CTPOP)
801     .legalFor({{S32, S32}, {S32, S64}})
802     .clampScalar(0, S32, S32)
803     .clampScalar(1, S32, S64)
804     .scalarize(0)
805     .widenScalarToNextPow2(0, 32)
806     .widenScalarToNextPow2(1, 32);
807 
808   // The hardware instructions return a different result on 0 than the generic
809   // instructions expect. The hardware produces -1, but these produce the
810   // bitwidth.
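  // For example, G_CTLZ of an s32 zero must produce 32; the lowering is
  // expected to expand these using the *_ZERO_UNDEF variants guarded by a
  // zero check.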
811   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
812     .scalarize(0)
813     .clampScalar(0, S32, S32)
814     .clampScalar(1, S32, S64)
815     .widenScalarToNextPow2(0, 32)
816     .widenScalarToNextPow2(1, 32)
817     .lower();
818 
819   // The 64-bit versions produce 32-bit results, but only on the SALU.
820   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
821     .legalFor({{S32, S32}, {S32, S64}})
822     .clampScalar(0, S32, S32)
823     .clampScalar(1, S32, S64)
824     .scalarize(0)
825     .widenScalarToNextPow2(0, 32)
826     .widenScalarToNextPow2(1, 32);
827 
828   getActionDefinitionsBuilder(G_BITREVERSE)
829     .legalFor({S32})
830     .clampScalar(0, S32, S32)
831     .scalarize(0);
832 
833   if (ST.has16BitInsts()) {
834     getActionDefinitionsBuilder(G_BSWAP)
835       .legalFor({S16, S32, V2S16})
836       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
839       .widenScalarToNextPow2(0)
840       .clampScalar(0, S16, S32)
841       .scalarize(0);
842 
843     if (ST.hasVOP3PInsts()) {
844       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
845         .legalFor({S32, S16, V2S16})
846         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
847         .clampMaxNumElements(0, S16, 2)
848         .minScalar(0, S16)
849         .widenScalarToNextPow2(0)
850         .scalarize(0)
851         .lower();
852     } else {
853       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
854         .legalFor({S32, S16})
855         .widenScalarToNextPow2(0)
856         .minScalar(0, S16)
857         .scalarize(0)
858         .lower();
859     }
860   } else {
861     // TODO: Should have same legality without v_perm_b32
862     getActionDefinitionsBuilder(G_BSWAP)
863       .legalFor({S32})
864       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
867       .widenScalarToNextPow2(0)
868       .maxScalar(0, S32)
869       .scalarize(0)
870       .lower();
871 
872     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
873       .legalFor({S32})
874       .minScalar(0, S32)
875       .widenScalarToNextPow2(0)
876       .scalarize(0)
877       .lower();
878   }
879 
880   getActionDefinitionsBuilder(G_INTTOPTR)
881     // List the common cases
882     .legalForCartesianProduct(AddrSpaces64, {S64})
883     .legalForCartesianProduct(AddrSpaces32, {S32})
884     .scalarize(0)
885     // Accept any address space as long as the size matches
886     .legalIf(sameSize(0, 1))
887     .widenScalarIf(smallerThan(1, 0),
888       [](const LegalityQuery &Query) {
889         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
890       })
891     .narrowScalarIf(largerThan(1, 0),
892       [](const LegalityQuery &Query) {
893         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
894       });
895 
896   getActionDefinitionsBuilder(G_PTRTOINT)
897     // List the common cases
898     .legalForCartesianProduct(AddrSpaces64, {S64})
899     .legalForCartesianProduct(AddrSpaces32, {S32})
900     .scalarize(0)
901     // Accept any address space as long as the size matches
902     .legalIf(sameSize(0, 1))
903     .widenScalarIf(smallerThan(0, 1),
904       [](const LegalityQuery &Query) {
905         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
906       })
907     .narrowScalarIf(
908       largerThan(0, 1),
909       [](const LegalityQuery &Query) {
910         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
911       });
912 
913   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
914     .scalarize(0)
915     .custom();
916 
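  // Returns true if a load or store must be split: vector extloads, accesses
  // wider than the maximum for their address space, dword counts the hardware
  // cannot do in a single access, and under-aligned accesses the target does
  // not allow.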
917   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
918                                     bool IsLoad) -> bool {
919     const LLT DstTy = Query.Types[0];
920 
921     // Split vector extloads.
922     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
923     unsigned Align = Query.MMODescrs[0].AlignInBits;
924 
925     if (MemSize < DstTy.getSizeInBits())
926       MemSize = std::max(MemSize, Align);
927 
928     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
929       return true;
930 
931     const LLT PtrTy = Query.Types[1];
932     unsigned AS = PtrTy.getAddressSpace();
933     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
934       return true;
935 
936     // Catch weird sized loads that don't evenly divide into the access sizes
937     // TODO: May be able to widen depending on alignment etc.
938     unsigned NumRegs = (MemSize + 31) / 32;
939     if (NumRegs == 3) {
940       if (!ST.hasDwordx3LoadStores())
941         return true;
942     } else {
943       // If the alignment allows, these should have been widened.
944       if (!isPowerOf2_32(NumRegs))
945         return true;
946     }
947 
948     if (Align < MemSize) {
949       const SITargetLowering *TLI = ST.getTargetLowering();
950       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
951     }
952 
953     return false;
954   };
955 
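  // Returns true if a load with a non-power-of-2 result size should be
  // widened to the next power of 2 rather than split, which is only done when
  // the alignment already covers the rounded-up access (e.g. a 96-bit load
  // that is 128-bit aligned, on subtargets without dwordx3 load/stores).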
956   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
957                                          unsigned Opc) -> bool {
958     unsigned Size = Query.Types[0].getSizeInBits();
959     if (isPowerOf2_32(Size))
960       return false;
961 
962     if (Size == 96 && ST.hasDwordx3LoadStores())
963       return false;
964 
965     unsigned AddrSpace = Query.Types[1].getAddressSpace();
966     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
967       return false;
968 
969     unsigned Align = Query.MMODescrs[0].AlignInBits;
970     unsigned RoundedSize = NextPowerOf2(Size);
971     return (Align >= RoundedSize);
972   };
973 
974   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
975   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
976   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
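  // An alignment of 0 in the memory descriptors below means the access is
  // legal at any alignment.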
977 
978   // TODO: Refine based on subtargets which support unaligned access or 128-bit
979   // LDS
980   // TODO: Unsupported flat for SI.
981 
982   for (unsigned Op : {G_LOAD, G_STORE}) {
983     const bool IsStore = Op == G_STORE;
984 
985     auto &Actions = getActionDefinitionsBuilder(Op);
986     // Explicitly list some common cases.
987     // TODO: Does this help compile time at all?
988     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
989                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
990                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
991                                       {S64, GlobalPtr, 64, GlobalAlign32},
992                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
993                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
994                                       {S32, GlobalPtr, 8, GlobalAlign8},
995                                       {S32, GlobalPtr, 16, GlobalAlign16},
996 
997                                       {S32, LocalPtr, 32, 32},
998                                       {S64, LocalPtr, 64, 32},
999                                       {V2S32, LocalPtr, 64, 32},
1000                                       {S32, LocalPtr, 8, 8},
1001                                       {S32, LocalPtr, 16, 16},
1002                                       {V2S16, LocalPtr, 32, 32},
1003 
1004                                       {S32, PrivatePtr, 32, 32},
1005                                       {S32, PrivatePtr, 8, 8},
1006                                       {S32, PrivatePtr, 16, 16},
1007                                       {V2S16, PrivatePtr, 32, 32},
1008 
1009                                       {S32, ConstantPtr, 32, GlobalAlign32},
1010                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1011                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1012                                       {S64, ConstantPtr, 64, GlobalAlign32},
1013                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1014     Actions.legalIf(
1015       [=](const LegalityQuery &Query) -> bool {
1016         return isLoadStoreLegal(ST, Query, Op);
1017       });
1018 
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64 bits.
1021     //
1022     // TODO: Should generalize bitcast action into coerce, which will also cover
1023     // inserting addrspacecasts.
1024     Actions.customIf(typeIs(1, Constant32Ptr));
1025 
1026     // Turn any illegal element vectors into something easier to deal
1027     // with. These will ultimately produce 32-bit scalar shifts to extract the
1028     // parts anyway.
1029     //
1030     // For odd 16-bit element vectors, prefer to split those into pieces with
1031     // 16-bit vector parts.
1032     Actions.bitcastIf(
1033       [=](const LegalityQuery &Query) -> bool {
1034         const LLT Ty = Query.Types[0];
1035         const unsigned Size = Ty.getSizeInBits();
1036 
1037         if (Size != Query.MMODescrs[0].SizeInBits)
1038           return Size <= 32 && Ty.isVector();
1039 
1040         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1041           return true;
1042         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1043                !isRegisterVectorElementType(Ty.getElementType());
1044       }, bitcastToRegisterType(0));
1045 
1046     Actions
1047         .customIf(typeIs(1, Constant32Ptr))
1048         // Widen suitably aligned loads by loading extra elements.
1049         .moreElementsIf([=](const LegalityQuery &Query) {
1050             const LLT Ty = Query.Types[0];
1051             return Op == G_LOAD && Ty.isVector() &&
1052                    shouldWidenLoadResult(Query, Op);
1053           }, moreElementsToNextPow2(0))
1054         .widenScalarIf([=](const LegalityQuery &Query) {
1055             const LLT Ty = Query.Types[0];
1056             return Op == G_LOAD && !Ty.isVector() &&
1057                    shouldWidenLoadResult(Query, Op);
1058           }, widenScalarOrEltToNextPow2(0))
1059         .narrowScalarIf(
1060             [=](const LegalityQuery &Query) -> bool {
1061               return !Query.Types[0].isVector() &&
1062                      needToSplitMemOp(Query, Op == G_LOAD);
1063             },
1064             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1065               const LLT DstTy = Query.Types[0];
1066               const LLT PtrTy = Query.Types[1];
1067 
1068               const unsigned DstSize = DstTy.getSizeInBits();
1069               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1070 
1071               // Split extloads.
1072               if (DstSize > MemSize)
1073                 return std::make_pair(0, LLT::scalar(MemSize));
1074 
1075               if (!isPowerOf2_32(DstSize)) {
1076                 // We're probably decomposing an odd sized store. Try to split
1077                 // to the widest type. TODO: Account for alignment. As-is it
1078                 // should be OK, since the new parts will be further legalized.
1079                 unsigned FloorSize = PowerOf2Floor(DstSize);
1080                 return std::make_pair(0, LLT::scalar(FloorSize));
1081               }
1082 
1083               if (DstSize > 32 && (DstSize % 32 != 0)) {
1084                 // FIXME: Need a way to specify non-extload of larger size if
1085                 // suitably aligned.
1086                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1087               }
1088 
1089               unsigned MaxSize = maxSizeForAddrSpace(ST,
1090                                                      PtrTy.getAddressSpace(),
1091                                                      Op == G_LOAD);
1092               if (MemSize > MaxSize)
1093                 return std::make_pair(0, LLT::scalar(MaxSize));
1094 
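              // The remaining reason to reach this point is insufficient
              // alignment; narrow to the alignment so each piece is naturally
              // aligned.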
1095               unsigned Align = Query.MMODescrs[0].AlignInBits;
1096               return std::make_pair(0, LLT::scalar(Align));
1097             })
1098         .fewerElementsIf(
1099             [=](const LegalityQuery &Query) -> bool {
1100               return Query.Types[0].isVector() &&
1101                      needToSplitMemOp(Query, Op == G_LOAD);
1102             },
1103             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1104               const LLT DstTy = Query.Types[0];
1105               const LLT PtrTy = Query.Types[1];
1106 
1107               LLT EltTy = DstTy.getElementType();
1108               unsigned MaxSize = maxSizeForAddrSpace(ST,
1109                                                      PtrTy.getAddressSpace(),
1110                                                      Op == G_LOAD);
1111 
              // FIXME: Handle results widened to a power of 2 better. This
              // ends up scalarizing.
              // FIXME: 3-element stores are scalarized on SI.
1115 
1116               // Split if it's too large for the address space.
1117               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1118                 unsigned NumElts = DstTy.getNumElements();
1119                 unsigned EltSize = EltTy.getSizeInBits();
1120 
1121                 if (MaxSize % EltSize == 0) {
1122                   return std::make_pair(
1123                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1124                 }
1125 
1126                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1127 
1128                 // FIXME: Refine when odd breakdowns handled
1129                 // The scalars will need to be re-legalized.
1130                 if (NumPieces == 1 || NumPieces >= NumElts ||
1131                     NumElts % NumPieces != 0)
1132                   return std::make_pair(0, EltTy);
1133 
1134                 return std::make_pair(0,
1135                                       LLT::vector(NumElts / NumPieces, EltTy));
1136               }
1137 
1138               // FIXME: We could probably handle weird extending loads better.
1139               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1140               if (DstTy.getSizeInBits() > MemSize)
1141                 return std::make_pair(0, EltTy);
1142 
1143               unsigned EltSize = EltTy.getSizeInBits();
1144               unsigned DstSize = DstTy.getSizeInBits();
1145               if (!isPowerOf2_32(DstSize)) {
1146                 // We're probably decomposing an odd sized store. Try to split
1147                 // to the widest type. TODO: Account for alignment. As-is it
1148                 // should be OK, since the new parts will be further legalized.
1149                 unsigned FloorSize = PowerOf2Floor(DstSize);
1150                 return std::make_pair(
1151                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1152               }
1153 
1154               // Need to split because of alignment.
1155               unsigned Align = Query.MMODescrs[0].AlignInBits;
1156               if (EltSize > Align &&
1157                   (EltSize / Align < DstTy.getNumElements())) {
1158                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1159               }
1160 
1161               // May need relegalization for the scalars.
1162               return std::make_pair(0, EltTy);
1163             })
1164         .minScalar(0, S32);
1165 
1166     if (IsStore)
1167       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1168 
1169     // TODO: Need a bitcast lower option?
1170     Actions
1171         .widenScalarToNextPow2(0)
1172         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1173   }
1174 
1175   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1176                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1177                                                   {S32, GlobalPtr, 16, 2 * 8},
1178                                                   {S32, LocalPtr, 8, 8},
1179                                                   {S32, LocalPtr, 16, 16},
1180                                                   {S32, PrivatePtr, 8, 8},
1181                                                   {S32, PrivatePtr, 16, 16},
1182                                                   {S32, ConstantPtr, 8, 8},
1183                                                   {S32, ConstantPtr, 16, 2 * 8}});
1184   if (ST.hasFlatAddressSpace()) {
1185     ExtLoads.legalForTypesWithMemDesc(
1186         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1187   }
1188 
1189   ExtLoads.clampScalar(0, S32, S32)
1190           .widenScalarToNextPow2(0)
1191           .unsupportedIfMemSizeNotPow2()
1192           .lower();
1193 
1194   auto &Atomics = getActionDefinitionsBuilder(
1195     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1196      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1197      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1198      G_ATOMICRMW_UMIN})
1199     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1200                {S64, GlobalPtr}, {S64, LocalPtr},
1201                {S32, RegionPtr}, {S64, RegionPtr}});
1202   if (ST.hasFlatAddressSpace()) {
1203     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1204   }
1205 
1206   if (ST.hasLDSFPAtomics()) {
1207     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1208       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1209   }
1210 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1213   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1214     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1215                 {S32, FlatPtr}, {S64, FlatPtr}})
1216     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1217                {S32, RegionPtr}, {S64, RegionPtr}});
1218   // TODO: Pointer types, any 32-bit or 64-bit vector
1219 
1220   // Condition should be s32 for scalar, s1 for vector.
1221   getActionDefinitionsBuilder(G_SELECT)
1222     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1223           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1224           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1225     .clampScalar(0, S16, S64)
1226     .scalarize(1)
1227     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1228     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1229     .clampMaxNumElements(0, S32, 2)
1230     .clampMaxNumElements(0, LocalPtr, 2)
1231     .clampMaxNumElements(0, PrivatePtr, 2)
1232     .scalarize(0)
1233     .widenScalarToNextPow2(0)
1234     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1235 
1236   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1237   // be more flexible with the shift amount type.
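  // (4 bits of shift amount for 16-bit shifts, 5 for 32-bit, 6 for 64-bit.)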
1238   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1239     .legalFor({{S32, S32}, {S64, S32}});
1240   if (ST.has16BitInsts()) {
1241     if (ST.hasVOP3PInsts()) {
1242       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1243             .clampMaxNumElements(0, S16, 2);
1244     } else
1245       Shifts.legalFor({{S16, S16}});
1246 
1247     // TODO: Support 16-bit shift amounts for all types
1248     Shifts.widenScalarIf(
1249       [=](const LegalityQuery &Query) {
1250         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1251         // 32-bit amount.
1252         const LLT ValTy = Query.Types[0];
1253         const LLT AmountTy = Query.Types[1];
1254         return ValTy.getSizeInBits() <= 16 &&
1255                AmountTy.getSizeInBits() < 16;
1256       }, changeTo(1, S16));
1257     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1258     Shifts.clampScalar(1, S32, S32);
1259     Shifts.clampScalar(0, S16, S64);
1260     Shifts.widenScalarToNextPow2(0, 16);
1261   } else {
1262     // Make sure we legalize the shift amount type first, as the general
1263     // expansion for the shifted type will produce much worse code if it hasn't
1264     // been truncated already.
1265     Shifts.clampScalar(1, S32, S32);
1266     Shifts.clampScalar(0, S32, S64);
1267     Shifts.widenScalarToNextPow2(0, 32);
1268   }
1269   Shifts.scalarize(0);
1270 
1271   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1272     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1273     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1274     unsigned IdxTypeIdx = 2;
1275 
1276     getActionDefinitionsBuilder(Op)
1277       .customIf([=](const LegalityQuery &Query) {
1278           const LLT EltTy = Query.Types[EltTypeIdx];
1279           const LLT VecTy = Query.Types[VecTypeIdx];
1280           const LLT IdxTy = Query.Types[IdxTypeIdx];
1281           return (EltTy.getSizeInBits() == 16 ||
1282                   EltTy.getSizeInBits() % 32 == 0) &&
1283                  VecTy.getSizeInBits() % 32 == 0 &&
1284                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1285                  IdxTy.getSizeInBits() == 32;
1286         })
1287       .clampScalar(EltTypeIdx, S32, S64)
1288       .clampScalar(VecTypeIdx, S32, S64)
1289       .clampScalar(IdxTypeIdx, S32, S32);
1290   }
1291 
1292   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1293     .unsupportedIf([=](const LegalityQuery &Query) {
1294         const LLT &EltTy = Query.Types[1].getElementType();
1295         return Query.Types[0] != EltTy;
1296       });
1297 
1298   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1299     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1300     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1301 
1302     // FIXME: Doesn't handle extract of illegal sizes.
1303     getActionDefinitionsBuilder(Op)
1304       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1305       // FIXME: Multiples of 16 should not be legal.
1306       .legalIf([=](const LegalityQuery &Query) {
1307           const LLT BigTy = Query.Types[BigTyIdx];
1308           const LLT LitTy = Query.Types[LitTyIdx];
1309           return (BigTy.getSizeInBits() % 32 == 0) &&
1310                  (LitTy.getSizeInBits() % 16 == 0);
1311         })
1312       .widenScalarIf(
1313         [=](const LegalityQuery &Query) {
1314           const LLT BigTy = Query.Types[BigTyIdx];
1315           return (BigTy.getScalarSizeInBits() < 16);
1316         },
1317         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1318       .widenScalarIf(
1319         [=](const LegalityQuery &Query) {
1320           const LLT LitTy = Query.Types[LitTyIdx];
1321           return (LitTy.getScalarSizeInBits() < 16);
1322         },
1323         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1324       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1325       .widenScalarToNextPow2(BigTyIdx, 32);
1326 
1327   }
1328 
1329   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1330     .legalForCartesianProduct(AllS32Vectors, {S32})
1331     .legalForCartesianProduct(AllS64Vectors, {S64})
1332     .clampNumElements(0, V16S32, V32S32)
1333     .clampNumElements(0, V2S64, V16S64)
1334     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1335 
1336   if (ST.hasScalarPackInsts()) {
1337     BuildVector
1338       // FIXME: Should probably widen s1 vectors straight to s32
1339       .minScalarOrElt(0, S16)
1340       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1341       .minScalar(1, S32);
1342 
1343     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1344       .legalFor({V2S16, S32})
1345       .lower();
1346     BuildVector.minScalarOrElt(0, S32);
1347   } else {
1348     BuildVector.customFor({V2S16, S16});
1349     BuildVector.minScalarOrElt(0, S32);
1350 
1351     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1352       .customFor({V2S16, S32})
1353       .lower();
1354   }
1355 
1356   BuildVector.legalIf(isRegisterType(0));
1357 
1358   // FIXME: Clamp maximum size
1359   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1360     .legalIf(isRegisterType(0));
1361 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1364   if (ST.hasVOP3PInsts()) {
1365     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1366       .customFor({V2S16, V2S16})
1367       .lower();
1368   } else
1369     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1370 
1371   // Merge/Unmerge
1372   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1373     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1374     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1375 
1376     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1377       const LLT Ty = Query.Types[TypeIdx];
1378       if (Ty.isVector()) {
1379         const LLT &EltTy = Ty.getElementType();
1380         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1381           return true;
1382         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1383           return true;
1384       }
1385       return false;
1386     };
1387 
1388     auto &Builder = getActionDefinitionsBuilder(Op)
1389       .lowerFor({{S16, V2S16}})
1390       .lowerIf([=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return BigTy.getSizeInBits() == 32;
1393         })
1394       // Try to widen to s16 first for small types.
1395       // TODO: Only do this on targets with legal s16 shifts
1396       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1397       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1398       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1399       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1400                            elementTypeIs(1, S16)),
1401                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1405       .clampScalar(LitTyIdx, S32, S512)
1406       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1407       // Break up vectors with weird elements into scalars
1408       .fewerElementsIf(
1409         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1410         scalarize(0))
1411       .fewerElementsIf(
1412         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1413         scalarize(1))
1414       .clampScalar(BigTyIdx, S32, MaxScalar);
1415 
1416     if (Op == G_MERGE_VALUES) {
1417       Builder.widenScalarIf(
1418         // TODO: Use 16-bit shifts if legal for 8-bit values?
1419         [=](const LegalityQuery &Query) {
1420           const LLT Ty = Query.Types[LitTyIdx];
1421           return Ty.getSizeInBits() < 32;
1422         },
1423         changeTo(LitTyIdx, S32));
1424     }
1425 
1426     Builder.widenScalarIf(
1427       [=](const LegalityQuery &Query) {
1428         const LLT Ty = Query.Types[BigTyIdx];
1429         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1430           Ty.getSizeInBits() % 16 != 0;
1431       },
1432       [=](const LegalityQuery &Query) {
        // Pick whichever is smaller: the next power of 2, or, above 128 bits,
        // the next multiple of 64.
1435         const LLT &Ty = Query.Types[BigTyIdx];
1436         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1437         if (NewSizeInBits >= 256) {
1438           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1439           if (RoundedTo < NewSizeInBits)
1440             NewSizeInBits = RoundedTo;
1441         }
1442         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1443       })
1444       .legalIf([=](const LegalityQuery &Query) {
1445           const LLT &BigTy = Query.Types[BigTyIdx];
1446           const LLT &LitTy = Query.Types[LitTyIdx];
1447 
1448           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1449             return false;
1450           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1451             return false;
1452 
1453           return BigTy.getSizeInBits() % 16 == 0 &&
1454                  LitTy.getSizeInBits() % 16 == 0 &&
1455                  BigTy.getSizeInBits() <= MaxRegisterSize;
1456         })
1457       // Any vectors left are the wrong size. Scalarize them.
1458       .scalarize(0)
1459       .scalarize(1);
1460   }
1461 
1462   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1463   // RegBankSelect.
1464   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1465     .legalFor({{S32}, {S64}});
1466 
1467   if (ST.hasVOP3PInsts()) {
1468     SextInReg.lowerFor({{V2S16}})
1469       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1470       // get more vector shift opportunities, since we'll get those when
1471       // expanded.
1472       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1473   } else if (ST.has16BitInsts()) {
1474     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1475   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
1478     SextInReg.lowerFor({{S32}, {S64}});
1479   }
1480 
1481   SextInReg
1482     .scalarize(0)
1483     .clampScalar(0, S32, S64)
1484     .lower();
1485 
1486   getActionDefinitionsBuilder(G_FSHR)
1487     .legalFor({{S32, S32}})
1488     .scalarize(0)
1489     .lower();
1490 
1491   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1492     .legalFor({S64});
1493 
1494   getActionDefinitionsBuilder(G_FENCE)
1495     .alwaysLegal();
1496 
1497   getActionDefinitionsBuilder({
1498       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1499       G_FCOPYSIGN,
1500 
1501       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1502       G_ATOMICRMW_NAND,
1503       G_ATOMICRMW_FSUB,
1504       G_READ_REGISTER,
1505       G_WRITE_REGISTER,
1506 
1507       G_SADDO, G_SSUBO,
1508 
1509        // TODO: Implement
1510       G_FMINIMUM, G_FMAXIMUM,
1511       G_FSHL
1512     }).lower();
1513 
1514   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1515         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1516         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1517     .unsupported();
1518 
1519   computeTables();
1520   verify(*ST.getInstrInfo());
1521 }
1522 
1523 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1524                                          MachineInstr &MI) const {
1525   MachineIRBuilder &B = Helper.MIRBuilder;
1526   MachineRegisterInfo &MRI = *B.getMRI();
1527   GISelChangeObserver &Observer = Helper.Observer;
1528 
1529   switch (MI.getOpcode()) {
1530   case TargetOpcode::G_ADDRSPACE_CAST:
1531     return legalizeAddrSpaceCast(MI, MRI, B);
1532   case TargetOpcode::G_FRINT:
1533     return legalizeFrint(MI, MRI, B);
1534   case TargetOpcode::G_FCEIL:
1535     return legalizeFceil(MI, MRI, B);
1536   case TargetOpcode::G_INTRINSIC_TRUNC:
1537     return legalizeIntrinsicTrunc(MI, MRI, B);
1538   case TargetOpcode::G_SITOFP:
1539     return legalizeITOFP(MI, MRI, B, true);
1540   case TargetOpcode::G_UITOFP:
1541     return legalizeITOFP(MI, MRI, B, false);
1542   case TargetOpcode::G_FPTOSI:
1543     return legalizeFPTOI(MI, MRI, B, true);
1544   case TargetOpcode::G_FPTOUI:
1545     return legalizeFPTOI(MI, MRI, B, false);
1546   case TargetOpcode::G_FMINNUM:
1547   case TargetOpcode::G_FMAXNUM:
1548   case TargetOpcode::G_FMINNUM_IEEE:
1549   case TargetOpcode::G_FMAXNUM_IEEE:
1550     return legalizeMinNumMaxNum(Helper, MI);
1551   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1552     return legalizeExtractVectorElt(MI, MRI, B);
1553   case TargetOpcode::G_INSERT_VECTOR_ELT:
1554     return legalizeInsertVectorElt(MI, MRI, B);
1555   case TargetOpcode::G_SHUFFLE_VECTOR:
1556     return legalizeShuffleVector(MI, MRI, B);
1557   case TargetOpcode::G_FSIN:
1558   case TargetOpcode::G_FCOS:
1559     return legalizeSinCos(MI, MRI, B);
1560   case TargetOpcode::G_GLOBAL_VALUE:
1561     return legalizeGlobalValue(MI, MRI, B);
1562   case TargetOpcode::G_LOAD:
1563     return legalizeLoad(MI, MRI, B, Observer);
1564   case TargetOpcode::G_FMAD:
1565     return legalizeFMad(MI, MRI, B);
1566   case TargetOpcode::G_FDIV:
1567     return legalizeFDIV(MI, MRI, B);
1568   case TargetOpcode::G_UDIV:
1569   case TargetOpcode::G_UREM:
1570     return legalizeUDIV_UREM(MI, MRI, B);
1571   case TargetOpcode::G_SDIV:
1572   case TargetOpcode::G_SREM:
1573     return legalizeSDIV_SREM(MI, MRI, B);
1574   case TargetOpcode::G_ATOMIC_CMPXCHG:
1575     return legalizeAtomicCmpXChg(MI, MRI, B);
1576   case TargetOpcode::G_FLOG:
1577     return legalizeFlog(MI, B, numbers::ln2f);
1578   case TargetOpcode::G_FLOG10:
1579     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1580   case TargetOpcode::G_FEXP:
1581     return legalizeFExp(MI, B);
1582   case TargetOpcode::G_FPOW:
1583     return legalizeFPow(MI, B);
1584   case TargetOpcode::G_FFLOOR:
1585     return legalizeFFloor(MI, MRI, B);
1586   case TargetOpcode::G_BUILD_VECTOR:
1587     return legalizeBuildVector(MI, MRI, B);
1588   default:
1589     return false;
1590   }
1591 
1592   llvm_unreachable("expected switch to return");
1593 }
1594 
1595 Register AMDGPULegalizerInfo::getSegmentAperture(
1596   unsigned AS,
1597   MachineRegisterInfo &MRI,
1598   MachineIRBuilder &B) const {
1599   MachineFunction &MF = B.getMF();
1600   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1601   const LLT S32 = LLT::scalar(32);
1602 
1603   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1604 
1605   if (ST.hasApertureRegs()) {
1606     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1607     // getreg.
1608     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1609         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1610         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1611     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1612         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1613         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1614     unsigned Encoding =
1615         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1616         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1617         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1618 
1619     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1620 
1621     B.buildInstr(AMDGPU::S_GETREG_B32)
1622       .addDef(GetReg)
1623       .addImm(Encoding);
1624     MRI.setType(GetReg, S32);
1625 
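    // s_getreg returns the aperture field shifted down to bit 0, so shift it
    // back up by the field width (WidthM1 + 1) to reconstruct the 32-bit
    // aperture value, i.e. the high half of the 64-bit segment base address.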
1626     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1627     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1628   }
1629 
1630   Register QueuePtr = MRI.createGenericVirtualRegister(
1631     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1632 
1633   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1634   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1635     return Register();
1636 
1637   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1638   // private_segment_aperture_base_hi.
1639   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1640 
1641   // TODO: can we be smarter about machine pointer info?
1642   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1643   MachineMemOperand *MMO = MF.getMachineMemOperand(
1644       PtrInfo,
1645       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1646           MachineMemOperand::MOInvariant,
1647       4, commonAlignment(Align(64), StructOffset));
1648 
1649   Register LoadAddr;
1650 
1651   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1652   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1653 }
1654 
1655 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1656   MachineInstr &MI, MachineRegisterInfo &MRI,
1657   MachineIRBuilder &B) const {
1658   MachineFunction &MF = B.getMF();
1659 
1660   const LLT S32 = LLT::scalar(32);
1661   Register Dst = MI.getOperand(0).getReg();
1662   Register Src = MI.getOperand(1).getReg();
1663 
1664   LLT DstTy = MRI.getType(Dst);
1665   LLT SrcTy = MRI.getType(Src);
1666   unsigned DestAS = DstTy.getAddressSpace();
1667   unsigned SrcAS = SrcTy.getAddressSpace();
1668 
1669   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1670   // vector element.
1671   assert(!DstTy.isVector());
1672 
1673   const AMDGPUTargetMachine &TM
1674     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1675 
1676   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1677   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1678     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1679     return true;
1680   }
1681 
1682   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1683     // Truncate.
1684     B.buildExtract(Dst, Src, 0);
1685     MI.eraseFromParent();
1686     return true;
1687   }
1688 
1689   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1690     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1691     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1692 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1696     auto HighAddr = B.buildConstant(
1697       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1698     B.buildMerge(Dst, {Src, HighAddr});
1699     MI.eraseFromParent();
1700     return true;
1701   }
1702 
1703   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1704     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1705            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1706     unsigned NullVal = TM.getNullPointerValue(DestAS);
1707 
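    // A null flat pointer maps to the segment's null value; otherwise the low
    // 32 bits of the flat pointer are the offset within the segment.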
1708     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1709     auto FlatNull = B.buildConstant(SrcTy, 0);
1710 
1711     // Extract low 32-bits of the pointer.
1712     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1713 
1714     auto CmpRes =
1715         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1716     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1717 
1718     MI.eraseFromParent();
1719     return true;
1720   }
1721 
1722   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1723     return false;
1724 
1725   if (!ST.hasFlatAddressSpace())
1726     return false;
1727 
1728   auto SegmentNull =
1729       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1730   auto FlatNull =
1731       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1732 
1733   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1734   if (!ApertureReg.isValid())
1735     return false;
1736 
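  // Local/private to flat: build the 64-bit flat pointer from the 32-bit
  // segment offset (low half) and the segment's aperture base (high half). A
  // segment null maps to the flat null value.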
1737   auto CmpRes =
1738       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1739 
1740   // Coerce the type of the low half of the result so we can use merge_values.
1741   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1742 
1743   // TODO: Should we allow mismatched types but matching sizes in merges to
1744   // avoid the ptrtoint?
1745   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1746   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1747 
1748   MI.eraseFromParent();
1749   return true;
1750 }
1751 
1752 bool AMDGPULegalizerInfo::legalizeFrint(
1753   MachineInstr &MI, MachineRegisterInfo &MRI,
1754   MachineIRBuilder &B) const {
1755   Register Src = MI.getOperand(1).getReg();
1756   LLT Ty = MRI.getType(Src);
1757   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1758 
1759   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1760   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1761 
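  // Adding and then subtracting 2^52 (with the sign of the input) rounds to an
  // integer in the current rounding mode, since doubles of magnitude >= 2^52
  // have no fraction bits. Inputs with |src| > 0x1.fffffffffffffp+51 are
  // already integral and are passed through by the final select.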
1762   auto C1 = B.buildFConstant(Ty, C1Val);
1763   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1764 
1765   // TODO: Should this propagate fast-math-flags?
1766   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1767   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1768 
1769   auto C2 = B.buildFConstant(Ty, C2Val);
1770   auto Fabs = B.buildFAbs(Ty, Src);
1771 
1772   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1773   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1774   MI.eraseFromParent();
1775   return true;
1776 }
1777 
1778 bool AMDGPULegalizerInfo::legalizeFceil(
1779   MachineInstr &MI, MachineRegisterInfo &MRI,
1780   MachineIRBuilder &B) const {
1781 
1782   const LLT S1 = LLT::scalar(1);
1783   const LLT S64 = LLT::scalar(64);
1784 
1785   Register Src = MI.getOperand(1).getReg();
1786   assert(MRI.getType(Src) == S64);
1787 
1788   // result = trunc(src)
1789   // if (src > 0.0 && src != result)
1790   //   result += 1.0
1791 
1792   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1793 
1794   const auto Zero = B.buildFConstant(S64, 0.0);
1795   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1799   auto Add = B.buildSelect(S64, And, One, Zero);
1800 
1801   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1804 }
1805 
1806 static MachineInstrBuilder extractF64Exponent(Register Hi,
1807                                               MachineIRBuilder &B) {
1808   const unsigned FractBits = 52;
1809   const unsigned ExpBits = 11;
1810   LLT S32 = LLT::scalar(32);
1811 
1812   auto Const0 = B.buildConstant(S32, FractBits - 32);
1813   auto Const1 = B.buildConstant(S32, ExpBits);
1814 
1815   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1816     .addUse(Hi)
1817     .addUse(Const0.getReg(0))
1818     .addUse(Const1.getReg(0));
1819 
1820   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1821 }
1822 
1823 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1824   MachineInstr &MI, MachineRegisterInfo &MRI,
1825   MachineIRBuilder &B) const {
1826   const LLT S1 = LLT::scalar(1);
1827   const LLT S32 = LLT::scalar(32);
1828   const LLT S64 = LLT::scalar(64);
1829 
1830   Register Src = MI.getOperand(1).getReg();
1831   assert(MRI.getType(Src) == S64);
1832 
1833   // TODO: Should this use extract since the low half is unused?
1834   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1835   Register Hi = Unmerge.getReg(1);
1836 
1837   // Extract the upper half, since this is where we will find the sign and
1838   // exponent.
1839   auto Exp = extractF64Exponent(Hi, B);
1840 
1841   const unsigned FractBits = 52;
1842 
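  // For exponents in [0, 51], truncate by masking off the fraction bits below
  // the binary point. A negative exponent means |src| < 1, so the result is
  // just the sign (+/-0.0). An exponent greater than 51 means src is already
  // an integer and is returned unchanged.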
1843   // Extract the sign bit.
1844   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1845   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1846 
1847   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1848 
1849   const auto Zero32 = B.buildConstant(S32, 0);
1850 
1851   // Extend back to 64-bits.
1852   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1853 
1854   auto Shr = B.buildAShr(S64, FractMask, Exp);
1855   auto Not = B.buildNot(S64, Shr);
1856   auto Tmp0 = B.buildAnd(S64, Src, Not);
1857   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1858 
1859   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1860   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1861 
1862   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1863   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1864   MI.eraseFromParent();
1865   return true;
1866 }
1867 
1868 bool AMDGPULegalizerInfo::legalizeITOFP(
1869   MachineInstr &MI, MachineRegisterInfo &MRI,
1870   MachineIRBuilder &B, bool Signed) const {
1871 
1872   Register Dst = MI.getOperand(0).getReg();
1873   Register Src = MI.getOperand(1).getReg();
1874 
1875   const LLT S64 = LLT::scalar(64);
1876   const LLT S32 = LLT::scalar(32);
1877 
1878   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1879 
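  // Convert the two 32-bit halves separately and recombine as
  //   fp(hi) * 2^32 + fp(lo)
  // where the high half keeps the signedness of the original operation and the
  // low half is always treated as unsigned.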
1880   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1881 
1882   auto CvtHi = Signed ?
1883     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1884     B.buildUITOFP(S64, Unmerge.getReg(1));
1885 
1886   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1887 
1888   auto ThirtyTwo = B.buildConstant(S32, 32);
1889   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1890     .addUse(CvtHi.getReg(0))
1891     .addUse(ThirtyTwo.getReg(0));
1892 
1893   // TODO: Should this propagate fast-math-flags?
1894   B.buildFAdd(Dst, LdExp, CvtLo);
1895   MI.eraseFromParent();
1896   return true;
1897 }
1898 
1899 // TODO: Copied from DAG implementation. Verify logic and document how this
1900 // actually works.
1901 bool AMDGPULegalizerInfo::legalizeFPTOI(
1902   MachineInstr &MI, MachineRegisterInfo &MRI,
1903   MachineIRBuilder &B, bool Signed) const {
1904 
1905   Register Dst = MI.getOperand(0).getReg();
1906   Register Src = MI.getOperand(1).getReg();
1907 
1908   const LLT S64 = LLT::scalar(64);
1909   const LLT S32 = LLT::scalar(32);
1910 
1911   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1912 
1913   unsigned Flags = MI.getFlags();
1914 
1915   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1916   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1917   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
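  // K0 = 2^-32 and K1 = -2^32. Split trunc(src) into 32-bit halves:
  //   hi = floor(trunc(src) * 2^-32)
  //   lo = fma(hi, -2^32, trunc(src))
  // then convert each half to an integer and remerge.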
1918 
1919   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1920   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1921   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1922 
1923   auto Hi = Signed ?
1924     B.buildFPTOSI(S32, FloorMul) :
1925     B.buildFPTOUI(S32, FloorMul);
1926   auto Lo = B.buildFPTOUI(S32, Fma);
1927 
1928   B.buildMerge(Dst, { Lo, Hi });
1929   MI.eraseFromParent();
1930 
1931   return true;
1932 }
1933 
1934 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1935                                                MachineInstr &MI) const {
1936   MachineFunction &MF = Helper.MIRBuilder.getMF();
1937   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1938 
1939   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1940                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1941 
1942   // With ieee_mode disabled, the instructions have the correct behavior
1943   // already for G_FMINNUM/G_FMAXNUM
1944   if (!MFI->getMode().IEEE)
1945     return !IsIEEEOp;
1946 
1947   if (IsIEEEOp)
1948     return true;
1949 
1950   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1951 }
1952 
1953 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1954   MachineInstr &MI, MachineRegisterInfo &MRI,
1955   MachineIRBuilder &B) const {
1956   // TODO: Should move some of this into LegalizerHelper.
1957 
1958   // TODO: Promote dynamic indexing of s16 to s32
1959 
1960   // FIXME: Artifact combiner probably should have replaced the truncated
1961   // constant before this, so we shouldn't need
1962   // getConstantVRegValWithLookThrough.
1963   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1964     MI.getOperand(2).getReg(), MRI);
1965   if (!IdxVal) // Dynamic case will be selected to register indexing.
1966     return true;
1967 
1968   Register Dst = MI.getOperand(0).getReg();
1969   Register Vec = MI.getOperand(1).getReg();
1970 
1971   LLT VecTy = MRI.getType(Vec);
1972   LLT EltTy = VecTy.getElementType();
1973   assert(EltTy == MRI.getType(Dst));
1974 
1975   if (IdxVal->Value < VecTy.getNumElements())
1976     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1977   else
1978     B.buildUndef(Dst);
1979 
1980   MI.eraseFromParent();
1981   return true;
1982 }
1983 
1984 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1985   MachineInstr &MI, MachineRegisterInfo &MRI,
1986   MachineIRBuilder &B) const {
1987   // TODO: Should move some of this into LegalizerHelper.
1988 
1989   // TODO: Promote dynamic indexing of s16 to s32
1990 
1991   // FIXME: Artifact combiner probably should have replaced the truncated
1992   // constant before this, so we shouldn't need
1993   // getConstantVRegValWithLookThrough.
1994   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1995     MI.getOperand(3).getReg(), MRI);
1996   if (!IdxVal) // Dynamic case will be selected to register indexing.
1997     return true;
1998 
1999   Register Dst = MI.getOperand(0).getReg();
2000   Register Vec = MI.getOperand(1).getReg();
2001   Register Ins = MI.getOperand(2).getReg();
2002 
2003   LLT VecTy = MRI.getType(Vec);
2004   LLT EltTy = VecTy.getElementType();
2005   assert(EltTy == MRI.getType(Ins));
2006 
2007   if (IdxVal->Value < VecTy.getNumElements())
2008     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2009   else
2010     B.buildUndef(Dst);
2011 
2012   MI.eraseFromParent();
2013   return true;
2014 }
2015 
2016 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2017   MachineInstr &MI, MachineRegisterInfo &MRI,
2018   MachineIRBuilder &B) const {
2019   const LLT V2S16 = LLT::vector(2, 16);
2020 
2021   Register Dst = MI.getOperand(0).getReg();
2022   Register Src0 = MI.getOperand(1).getReg();
2023   LLT DstTy = MRI.getType(Dst);
2024   LLT SrcTy = MRI.getType(Src0);
2025 
2026   if (SrcTy == V2S16 && DstTy == V2S16 &&
2027       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2028     return true;
2029 
2030   MachineIRBuilder HelperBuilder(MI);
2031   GISelObserverWrapper DummyObserver;
2032   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2033   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2034 }
2035 
2036 bool AMDGPULegalizerInfo::legalizeSinCos(
2037   MachineInstr &MI, MachineRegisterInfo &MRI,
2038   MachineIRBuilder &B) const {
2039 
2040   Register DstReg = MI.getOperand(0).getReg();
2041   Register SrcReg = MI.getOperand(1).getReg();
2042   LLT Ty = MRI.getType(DstReg);
2043   unsigned Flags = MI.getFlags();
2044 
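  // The hardware sin/cos intrinsics take an input pre-scaled by 1/(2*pi), so
  // multiply by that constant first. Subtargets with a reduced trig input
  // range additionally take the fractional part to keep the operand in range.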
2045   Register TrigVal;
2046   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2047   if (ST.hasTrigReducedRange()) {
2048     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2049     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2050       .addUse(MulVal.getReg(0))
2051       .setMIFlags(Flags).getReg(0);
2052   } else
2053     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2054 
2055   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2056     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2057   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2058     .addUse(TrigVal)
2059     .setMIFlags(Flags);
2060   MI.eraseFromParent();
2061   return true;
2062 }
2063 
2064 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2065                                                   MachineIRBuilder &B,
2066                                                   const GlobalValue *GV,
2067                                                   int64_t Offset,
2068                                                   unsigned GAFlags) const {
2069   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2070   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2071   // to the following code sequence:
2072   //
2073   // For constant address space:
2074   //   s_getpc_b64 s[0:1]
2075   //   s_add_u32 s0, s0, $symbol
2076   //   s_addc_u32 s1, s1, 0
2077   //
2078   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2079   //   a fixup or relocation is emitted to replace $symbol with a literal
2080   //   constant, which is a pc-relative offset from the encoding of the $symbol
2081   //   operand to the global variable.
2082   //
2083   // For global address space:
2084   //   s_getpc_b64 s[0:1]
2085   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2086   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2087   //
2088   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2089   //   fixups or relocations are emitted to replace $symbol@*@lo and
2090   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2091   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2092   //   operand to the global variable.
2093   //
2094   // What we want here is an offset from the value returned by s_getpc
2095   // (which is the address of the s_add_u32 instruction) to the global
2096   // variable, but since the encoding of $symbol starts 4 bytes after the start
2097   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2098   // small. This requires us to add 4 to the global variable offset in order to
2099   // compute the correct address.
2100 
2101   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2102 
2103   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2104     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2105 
2106   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2107     .addDef(PCReg);
2108 
2109   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2110   if (GAFlags == SIInstrInfo::MO_NONE)
2111     MIB.addImm(0);
2112   else
2113     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2114 
2115   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2116 
2117   if (PtrTy.getSizeInBits() == 32)
2118     B.buildExtract(DstReg, PCReg, 0);
2119   return true;
}
2121 
2122 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2123   MachineInstr &MI, MachineRegisterInfo &MRI,
2124   MachineIRBuilder &B) const {
2125   Register DstReg = MI.getOperand(0).getReg();
2126   LLT Ty = MRI.getType(DstReg);
2127   unsigned AS = Ty.getAddressSpace();
2128 
2129   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2130   MachineFunction &MF = B.getMF();
2131   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2132 
2133   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2134     if (!MFI->isEntryFunction()) {
2135       const Function &Fn = MF.getFunction();
2136       DiagnosticInfoUnsupported BadLDSDecl(
2137         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2138         DS_Warning);
2139       Fn.getContext().diagnose(BadLDSDecl);
2140 
2141       // We currently don't have a way to correctly allocate LDS objects that
2142       // aren't directly associated with a kernel. We do force inlining of
2143       // functions that use local objects. However, if these dead functions are
2144       // not eliminated, we don't want a compile time error. Just emit a warning
2145       // and a trap, since there should be no callable path here.
2146       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2147       B.buildUndef(DstReg);
2148       MI.eraseFromParent();
2149       return true;
2150     }
2151 
2152     // TODO: We could emit code to handle the initialization somewhere.
2153     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2154       const SITargetLowering *TLI = ST.getTargetLowering();
2155       if (!TLI->shouldUseLDSConstAddress(GV)) {
2156         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2158       }
2159 
2160       B.buildConstant(
2161           DstReg,
2162           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2163       MI.eraseFromParent();
2164       return true;
2165     }
2166 
2167     const Function &Fn = MF.getFunction();
2168     DiagnosticInfoUnsupported BadInit(
2169       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2170     Fn.getContext().diagnose(BadInit);
2171     return true;
2172   }
2173 
2174   const SITargetLowering *TLI = ST.getTargetLowering();
2175 
2176   if (TLI->shouldEmitFixup(GV)) {
2177     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2178     MI.eraseFromParent();
2179     return true;
2180   }
2181 
2182   if (TLI->shouldEmitPCReloc(GV)) {
2183     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2184     MI.eraseFromParent();
2185     return true;
2186   }
2187 
2188   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2189   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2190 
2191   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2192       MachinePointerInfo::getGOT(MF),
2193       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2194           MachineMemOperand::MOInvariant,
2195       8 /*Size*/, Align(8));
2196 
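  // The GOT entry holds the absolute 64-bit address of the global. Compute the
  // entry's address pc-relatively, then load the pointer from it.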
2197   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2198 
2199   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2201     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2202     B.buildExtract(DstReg, Load, 0);
2203   } else
2204     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeLoad(
2211   MachineInstr &MI, MachineRegisterInfo &MRI,
2212   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2213   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2214   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2215   Observer.changingInstr(MI);
2216   MI.getOperand(1).setReg(Cast.getReg(0));
2217   Observer.changedInstr(MI);
2218   return true;
2219 }
2220 
2221 bool AMDGPULegalizerInfo::legalizeFMad(
2222   MachineInstr &MI, MachineRegisterInfo &MRI,
2223   MachineIRBuilder &B) const {
2224   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2225   assert(Ty.isScalar());
2226 
2227   MachineFunction &MF = B.getMF();
2228   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2229 
2230   // TODO: Always legal with future ftz flag.
2231   // FIXME: Do we need just output?
2232   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2233     return true;
2234   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2235     return true;
2236 
2237   MachineIRBuilder HelperBuilder(MI);
2238   GISelObserverWrapper DummyObserver;
2239   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2240   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2241 }
2242 
2243 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2244   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2245   Register DstReg = MI.getOperand(0).getReg();
2246   Register PtrReg = MI.getOperand(1).getReg();
2247   Register CmpVal = MI.getOperand(2).getReg();
2248   Register NewVal = MI.getOperand(3).getReg();
2249 
2250   assert(SITargetLowering::isFlatGlobalAddrSpace(
2251            MRI.getType(PtrReg).getAddressSpace()) &&
2252          "this should not have been custom lowered");
2253 
2254   LLT ValTy = MRI.getType(CmpVal);
2255   LLT VecTy = LLT::vector(2, ValTy);
2256 
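  // G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare value packed
  // into a single vector data operand.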
2257   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2258 
2259   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2260     .addDef(DstReg)
2261     .addUse(PtrReg)
2262     .addUse(PackedVal)
2263     .setMemRefs(MI.memoperands());
2264 
2265   MI.eraseFromParent();
2266   return true;
2267 }
2268 
2269 bool AMDGPULegalizerInfo::legalizeFlog(
2270   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2271   Register Dst = MI.getOperand(0).getReg();
2272   Register Src = MI.getOperand(1).getReg();
2273   LLT Ty = B.getMRI()->getType(Dst);
2274   unsigned Flags = MI.getFlags();
2275 
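  // log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).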
2276   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2277   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2278 
2279   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2280   MI.eraseFromParent();
2281   return true;
2282 }
2283 
2284 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2285                                        MachineIRBuilder &B) const {
2286   Register Dst = MI.getOperand(0).getReg();
2287   Register Src = MI.getOperand(1).getReg();
2288   unsigned Flags = MI.getFlags();
2289   LLT Ty = B.getMRI()->getType(Dst);
2290 
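  // exp(x) = exp2(x * log2(e)).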
2291   auto K = B.buildFConstant(Ty, numbers::log2e);
2292   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2293   B.buildFExp2(Dst, Mul, Flags);
2294   MI.eraseFromParent();
2295   return true;
2296 }
2297 
2298 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2299                                        MachineIRBuilder &B) const {
2300   Register Dst = MI.getOperand(0).getReg();
2301   Register Src0 = MI.getOperand(1).getReg();
2302   Register Src1 = MI.getOperand(2).getReg();
2303   unsigned Flags = MI.getFlags();
2304   LLT Ty = B.getMRI()->getType(Dst);
2305   const LLT S16 = LLT::scalar(16);
2306   const LLT S32 = LLT::scalar(32);
2307 
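  // pow(x, y) = exp2(y * log2(x)). The legacy multiply treats 0 * anything as
  // 0, which keeps edge cases like pow(1.0, inf) from turning into exp2(nan).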
2308   if (Ty == S32) {
2309     auto Log = B.buildFLog2(S32, Src0, Flags);
2310     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2311       .addUse(Log.getReg(0))
2312       .addUse(Src1)
2313       .setMIFlags(Flags);
2314     B.buildFExp2(Dst, Mul, Flags);
2315   } else if (Ty == S16) {
2316     // There's no f16 fmul_legacy, so we need to convert for it.
2317     auto Log = B.buildFLog2(S16, Src0, Flags);
2318     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2319     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2320     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2321       .addUse(Ext0.getReg(0))
2322       .addUse(Ext1.getReg(0))
2323       .setMIFlags(Flags);
2324 
2325     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2326   } else
2327     return false;
2328 
2329   MI.eraseFromParent();
2330   return true;
2331 }
2332 
2333 // Find a source register, ignoring any possible source modifiers.
2334 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2335   Register ModSrc = OrigSrc;
2336   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2337     ModSrc = SrcFNeg->getOperand(1).getReg();
2338     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2339       ModSrc = SrcFAbs->getOperand(1).getReg();
2340   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2341     ModSrc = SrcFAbs->getOperand(1).getReg();
2342   return ModSrc;
2343 }
2344 
2345 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2346                                          MachineRegisterInfo &MRI,
2347                                          MachineIRBuilder &B) const {
2348 
2349   const LLT S1 = LLT::scalar(1);
2350   const LLT S64 = LLT::scalar(64);
2351   Register Dst = MI.getOperand(0).getReg();
2352   Register OrigSrc = MI.getOperand(1).getReg();
2353   unsigned Flags = MI.getFlags();
2354   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2355          "this should not have been custom lowered");
2356 
2357   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2358   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2359   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2360   // V_FRACT bug is:
2361   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2362   //
2363   // Convert floor(x) to (x - fract(x))
2364 
2365   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2366     .addUse(OrigSrc)
2367     .setMIFlags(Flags);
2368 
2369   // Give source modifier matching some assistance before obscuring a foldable
2370   // pattern.
2371 
2372   // TODO: We can avoid the neg on the fract? The input sign to fract
2373   // shouldn't matter?
2374   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2375 
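  // 0x3fefffffffffffff is the largest double strictly less than 1.0, i.e. the
  // 0.99999999999999999 clamp value from the workaround described above.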
2376   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2377 
2378   Register Min = MRI.createGenericVirtualRegister(S64);
2379 
2380   // We don't need to concern ourselves with the snan handling difference, so
2381   // use the one which will directly select.
2382   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2383   if (MFI->getMode().IEEE)
2384     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2385   else
2386     B.buildFMinNum(Min, Fract, Const, Flags);
2387 
2388   Register CorrectedFract = Min;
2389   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2390     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2391     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2392   }
2393 
2394   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2395   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2396 
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
2401 // Turn an illegal packed v2s16 build vector into bit operations.
2402 // TODO: This should probably be a bitcast action in LegalizerHelper.
2403 bool AMDGPULegalizerInfo::legalizeBuildVector(
2404   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2405   Register Dst = MI.getOperand(0).getReg();
2406   const LLT S32 = LLT::scalar(32);
2407   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2408 
2409   Register Src0 = MI.getOperand(1).getReg();
2410   Register Src1 = MI.getOperand(2).getReg();
2411   assert(MRI.getType(Src0) == LLT::scalar(16));
2412 
2413   auto Merge = B.buildMerge(S32, {Src0, Src1});
2414   B.buildBitcast(Dst, Merge);
2415 
2416   MI.eraseFromParent();
2417   return true;
2418 }
2419 
2420 // Return the use branch instruction, otherwise null if the usage is invalid.
2421 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2422                                        MachineRegisterInfo &MRI,
2423                                        MachineInstr *&Br,
2424                                        MachineBasicBlock *&UncondBrTarget) {
2425   Register CondDef = MI.getOperand(0).getReg();
2426   if (!MRI.hasOneNonDBGUse(CondDef))
2427     return nullptr;
2428 
2429   MachineBasicBlock *Parent = MI.getParent();
2430   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2431   if (UseMI.getParent() != Parent ||
2432       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2433     return nullptr;
2434 
2435   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2436   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2437   if (Next == Parent->end()) {
2438     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2439     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2440       return nullptr;
2441     UncondBrTarget = &*NextMBB;
2442   } else {
2443     if (Next->getOpcode() != AMDGPU::G_BR)
2444       return nullptr;
2445     Br = &*Next;
2446     UncondBrTarget = Br->getOperand(0).getMBB();
2447   }
2448 
2449   return &UseMI;
2450 }
2451 
2452 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2453                                                MachineRegisterInfo &MRI,
2454                                                Register LiveIn,
2455                                                Register PhyReg) const {
2456   assert(PhyReg.isPhysical() && "Physical register expected");
2457 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2460   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2461   if (!MRI.getVRegDef(LiveIn)) {
2462     // FIXME: Should have scoped insert pt
2463     MachineBasicBlock &OrigInsBB = B.getMBB();
2464     auto OrigInsPt = B.getInsertPt();
2465 
2466     MachineBasicBlock &EntryMBB = B.getMF().front();
2467     EntryMBB.addLiveIn(PhyReg);
2468     B.setInsertPt(EntryMBB, EntryMBB.begin());
2469     B.buildCopy(LiveIn, PhyReg);
2470 
2471     B.setInsertPt(OrigInsBB, OrigInsPt);
2472   }
2473 
2474   return LiveIn;
2475 }
2476 
2477 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2478                                                 MachineRegisterInfo &MRI,
2479                                                 Register PhyReg, LLT Ty,
2480                                                 bool InsertLiveInCopy) const {
2481   assert(PhyReg.isPhysical() && "Physical register expected");
2482 
  // Get or create the virtual live-in register.
2484   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2485   if (!LiveIn) {
2486     LiveIn = MRI.createGenericVirtualRegister(Ty);
2487     MRI.addLiveIn(PhyReg, LiveIn);
2488   }
2489 
  // When the only copy actually required is from a virtual register to a
  // physical register (to be inserted later), inserting the live-in copy from
  // the physical register to a virtual register here is not needed.
2493   if (!InsertLiveInCopy)
2494     return LiveIn;
2495 
2496   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2497 }
2498 
2499 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2500     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2501   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2502   const ArgDescriptor *Arg;
2503   const TargetRegisterClass *RC;
2504   LLT ArgTy;
2505   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2506   if (!Arg) {
2507     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2508     return nullptr;
2509   }
2510   return Arg;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2514                                          const ArgDescriptor *Arg) const {
2515   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2516     return false; // TODO: Handle these
2517 
2518   Register SrcReg = Arg->getRegister();
2519   assert(SrcReg.isPhysical() && "Physical register expected");
2520   assert(DstReg.isVirtual() && "Virtual register expected");
2521 
2522   MachineRegisterInfo &MRI = *B.getMRI();
2523 
2524   LLT Ty = MRI.getType(DstReg);
2525   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2526 
2527   if (Arg->isMasked()) {
2528     // TODO: Should we try to emit this once in the entry block?
2529     const LLT S32 = LLT::scalar(32);
2530     const unsigned Mask = Arg->getMask();
2531     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2532 
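    // Extract the argument's bitfield from the packed input register:
    // (LiveIn >> Shift) & (Mask >> Shift).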
2533     Register AndMaskSrc = LiveIn;
2534 
2535     if (Shift != 0) {
2536       auto ShiftAmt = B.buildConstant(S32, Shift);
2537       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2538     }
2539 
2540     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2541   } else {
2542     B.buildCopy(DstReg, LiveIn);
2543   }
2544 
2545   return true;
2546 }
2547 
2548 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2549     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2550     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2551 
2552   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2553   if (!Arg)
2554     return false;
2555 
2556   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2557     return false;
2558 
2559   MI.eraseFromParent();
2560   return true;
2561 }
2562 
2563 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2564                                        MachineRegisterInfo &MRI,
2565                                        MachineIRBuilder &B) const {
2566   Register Dst = MI.getOperand(0).getReg();
2567   LLT DstTy = MRI.getType(Dst);
2568   LLT S16 = LLT::scalar(16);
2569   LLT S32 = LLT::scalar(32);
2570   LLT S64 = LLT::scalar(64);
2571 
2572   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2573     return true;
2574 
2575   if (DstTy == S16)
2576     return legalizeFDIV16(MI, MRI, B);
2577   if (DstTy == S32)
2578     return legalizeFDIV32(MI, MRI, B);
2579   if (DstTy == S64)
2580     return legalizeFDIV64(MI, MRI, B);
2581 
2582   return false;
2583 }
2584 
2585 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2586                                                   Register DstReg,
2587                                                   Register X,
2588                                                   Register Y,
2589                                                   bool IsDiv) const {
2590   const LLT S1 = LLT::scalar(1);
2591   const LLT S32 = LLT::scalar(32);
2592 
2593   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2594   // algorithm used here.
2595 
2596   // Initial estimate of inv(y).
2597   auto FloatY = B.buildUITOFP(S32, Y);
2598   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2599   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2600   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2601   auto Z = B.buildFPTOUI(S32, ScaledY);
2602 
2603   // One round of UNR.
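  // UNR = unsigned Newton-Raphson: z += umulh(z, -y * z), i.e.
  // z' = z * (2 - y * z) in 0.32 fixed point, roughly doubling the number of
  // correct bits in the reciprocal estimate.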
2604   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2605   auto NegYZ = B.buildMul(S32, NegY, Z);
2606   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2607 
2608   // Quotient/remainder estimate.
2609   auto Q = B.buildUMulH(S32, X, Z);
2610   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2611 
2612   // First quotient/remainder refinement.
2613   auto One = B.buildConstant(S32, 1);
2614   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2615   if (IsDiv)
2616     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2617   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2618 
2619   // Second quotient/remainder refinement.
2620   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2621   if (IsDiv)
2622     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2623   else
2624     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2625 }
2626 
2627 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2628                                               MachineRegisterInfo &MRI,
2629                                               MachineIRBuilder &B) const {
2630   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2631   Register DstReg = MI.getOperand(0).getReg();
2632   Register Num = MI.getOperand(1).getReg();
2633   Register Den = MI.getOperand(2).getReg();
2634   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2635   MI.eraseFromParent();
2636   return true;
2637 }
2638 
// Build an integer reciprocal sequence around V_RCP_IFLAG_F32
2640 //
2641 // Return lo, hi of result
2642 //
2643 // %cvt.lo = G_UITOFP Val.lo
2644 // %cvt.hi = G_UITOFP Val.hi
2645 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2646 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2647 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2648 // %mul2 = G_FMUL %mul1, 2**(-32)
2649 // %trunc = G_INTRINSIC_TRUNC %mul2
2650 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2651 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2652 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2653                                                        Register Val) {
2654   const LLT S32 = LLT::scalar(32);
2655   auto Unmerge = B.buildUnmerge(S32, Val);
2656 
2657   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2658   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2659 
2660   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2661                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2662 
2663   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2664   auto Mul1 =
2665       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2666 
2667   // 2**(-32)
2668   auto Mul2 =
2669       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2670   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2671 
2672   // -(2**32)
2673   auto Mad2 = B.buildFMAD(S32, Trunc,
2674                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2675 
2676   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2677   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2678 
2679   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2680 }
2681 
2682 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2683                                                   Register DstReg,
2684                                                   Register Numer,
2685                                                   Register Denom,
2686                                                   bool IsDiv) const {
2687   const LLT S32 = LLT::scalar(32);
2688   const LLT S64 = LLT::scalar(64);
2689   const LLT S1 = LLT::scalar(1);
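  // Roughly: compute a fixed-point estimate of 2^64 / Denom, refine it with
  // two Newton-Raphson style steps in extended precision, form a quotient
  // estimate with a 64-bit high multiply, then correct the result with up to
  // two conditional subtractions of Denom.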
2690   Register RcpLo, RcpHi;
2691 
2692   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2693 
2694   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2695 
2696   auto Zero64 = B.buildConstant(S64, 0);
2697   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2698 
2699   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2700   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2701 
2702   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2703   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2704   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2705 
2706   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2707   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2708   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2709   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2710 
2711   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2712   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2713   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2714   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2715   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2716 
2717   auto Zero32 = B.buildConstant(S32, 0);
2718   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2719   auto Add2_HiC =
2720       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2721   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2722   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2723 
2724   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2725   Register NumerLo = UnmergeNumer.getReg(0);
2726   Register NumerHi = UnmergeNumer.getReg(1);
2727 
2728   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2729   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2730   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2731   Register Mul3_Lo = UnmergeMul3.getReg(0);
2732   Register Mul3_Hi = UnmergeMul3.getReg(1);
2733   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2734   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2735   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2736   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2737 
2738   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2739   Register DenomLo = UnmergeDenom.getReg(0);
2740   Register DenomHi = UnmergeDenom.getReg(1);
2741 
2742   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2743   auto C1 = B.buildSExt(S32, CmpHi);
2744 
2745   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2746   auto C2 = B.buildSExt(S32, CmpLo);
2747 
2748   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2749   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2750 
  // TODO: Here and below, portions of the code can be enclosed in if/endif.
  // Currently the control flow is unconditional and we have 4 selects after
  // the potential endif to substitute for PHIs.
2754 
2755   // if C3 != 0 ...
2756   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2757   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2758   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2759   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2760 
2761   auto One64 = B.buildConstant(S64, 1);
2762   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2763 
2764   auto C4 =
2765       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2766   auto C5 =
2767       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2768   auto C6 = B.buildSelect(
2769       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2770 
2771   // if (C6 != 0)
2772   auto Add4 = B.buildAdd(S64, Add3, One64);
2773   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2774 
2775   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2776   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2777   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2778 
2779   // endif C6
2780   // endif C3
2781 
2782   if (IsDiv) {
2783     auto Sel1 = B.buildSelect(
2784         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2785     B.buildSelect(DstReg,
2786                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2787   } else {
2788     auto Sel2 = B.buildSelect(
2789         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2790     B.buildSelect(DstReg,
2791                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2792   }
2793 }
2794 
2795 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2796                                             MachineRegisterInfo &MRI,
2797                                             MachineIRBuilder &B) const {
2798   const LLT S64 = LLT::scalar(64);
2799   const LLT S32 = LLT::scalar(32);
2800   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2801   Register DstReg = MI.getOperand(0).getReg();
2802   Register Num = MI.getOperand(1).getReg();
2803   Register Den = MI.getOperand(2).getReg();
2804   LLT Ty = MRI.getType(DstReg);
2805 
2806   if (Ty == S32)
2807     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2808   else if (Ty == S64)
2809     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2810   else
2811     return false;
2812 
2813   MI.eraseFromParent();
2814   return true;
2816 }
2817 
2818 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2819                                             MachineRegisterInfo &MRI,
2820                                             MachineIRBuilder &B) const {
2821   const LLT S64 = LLT::scalar(64);
2822   const LLT S32 = LLT::scalar(32);
2823 
2824   Register DstReg = MI.getOperand(0).getReg();
2825   const LLT Ty = MRI.getType(DstReg);
2826   if (Ty != S32 && Ty != S64)
2827     return false;
2828 
2829   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2830 
2831   Register LHS = MI.getOperand(1).getReg();
2832   Register RHS = MI.getOperand(2).getReg();
2833 
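  // Take absolute values using the (x + sign) ^ sign identity, where sign is x
  // arithmetically shifted right by the bit width minus one. After the
  // unsigned division, the quotient's sign is sign(LHS) ^ sign(RHS) and the
  // remainder takes the sign of LHS.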
2834   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2835   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2836   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2837 
2838   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2839   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2840 
2841   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2842   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2843 
2844   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2845   if (Ty == S32)
2846     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2847   else
2848     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2849 
2850   Register Sign;
2851   if (IsDiv)
2852     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2853   else
2854     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2855 
2856   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2857   B.buildSub(DstReg, UDivRem, Sign);
2858 
2859   MI.eraseFromParent();
2860   return true;
2861 }
2862 
2863 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2864                                                  MachineRegisterInfo &MRI,
2865                                                  MachineIRBuilder &B) const {
2866   Register Res = MI.getOperand(0).getReg();
2867   Register LHS = MI.getOperand(1).getReg();
2868   Register RHS = MI.getOperand(2).getReg();
2869 
2870   uint16_t Flags = MI.getFlags();
2871 
2872   LLT ResTy = MRI.getType(Res);
2873   LLT S32 = LLT::scalar(32);
2874   LLT S64 = LLT::scalar(64);
2875 
2876   const MachineFunction &MF = B.getMF();
2877   bool Unsafe =
2878     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2879 
2880   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2881     return false;
2882 
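  // Without arcp/unsafe-math, only use the rcp-based f32 expansion when FP32
  // denormals are flushed.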
2883   if (!Unsafe && ResTy == S32 &&
2884       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2885     return false;
2886 
2887   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2888     // 1 / x -> RCP(x)
2889     if (CLHS->isExactlyValue(1.0)) {
2890       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2891         .addUse(RHS)
2892         .setMIFlags(Flags);
2893 
2894       MI.eraseFromParent();
2895       return true;
2896     }
2897 
2898     // -1 / x -> RCP( FNEG(x) )
2899     if (CLHS->isExactlyValue(-1.0)) {
2900       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2901       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2902         .addUse(FNeg.getReg(0))
2903         .setMIFlags(Flags);
2904 
2905       MI.eraseFromParent();
2906       return true;
2907     }
2908   }
2909 
2910   // x / y -> x * (1.0 / y)
2911   if (Unsafe) {
2912     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2913       .addUse(RHS)
2914       .setMIFlags(Flags);
2915     B.buildFMul(Res, LHS, RCP, Flags);
2916 
2917     MI.eraseFromParent();
2918     return true;
2919   }
2920 
2921   return false;
2922 }
2923 
2924 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2925                                          MachineRegisterInfo &MRI,
2926                                          MachineIRBuilder &B) const {
2927   Register Res = MI.getOperand(0).getReg();
2928   Register LHS = MI.getOperand(1).getReg();
2929   Register RHS = MI.getOperand(2).getReg();
2930 
2931   uint16_t Flags = MI.getFlags();
2932 
2933   LLT S16 = LLT::scalar(16);
2934   LLT S32 = LLT::scalar(32);
2935 
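  // Lower f16 fdiv by promoting both operands to f32, multiplying by the f32
  // reciprocal, truncating back to f16, and letting div_fixup handle the
  // special cases.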
2936   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2937   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2938 
2939   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2940     .addUse(RHSExt.getReg(0))
2941     .setMIFlags(Flags);
2942 
2943   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2944   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2945 
2946   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2947     .addUse(RDst.getReg(0))
2948     .addUse(RHS)
2949     .addUse(LHS)
2950     .setMIFlags(Flags);
2951 
2952   MI.eraseFromParent();
2953   return true;
2954 }
2955 
2956 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2957 // to enable denorm mode; otherwise, emit instructions to disable it.
2958 static void toggleSPDenormMode(bool Enable,
2959                                MachineIRBuilder &B,
2960                                const GCNSubtarget &ST,
2961                                AMDGPU::SIModeRegisterDefaults Mode) {
2962   // Set SP denorm mode to this value.
2963   unsigned SPDenormMode =
2964     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2965 
2966   if (ST.hasDenormModeInst()) {
2967     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2968     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2969 
2970     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2971     B.buildInstr(AMDGPU::S_DENORM_MODE)
2972       .addImm(NewDenormModeValue);
2973 
2974   } else {
2975     // Select FP32 bit field in mode register.
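    // The immediate encodes a 2-bit field (WIDTH_M1 = 1) at bit offset 4 of
    // the MODE register, which holds the FP32 denorm controls.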
2976     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2977                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2978                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2979 
2980     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2981       .addImm(SPDenormMode)
2982       .addImm(SPDenormModeBitField);
2983   }
2984 }
2985 
2986 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2987                                          MachineRegisterInfo &MRI,
2988                                          MachineIRBuilder &B) const {
2989   Register Res = MI.getOperand(0).getReg();
2990   Register LHS = MI.getOperand(1).getReg();
2991   Register RHS = MI.getOperand(2).getReg();
2992   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2993   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2994 
2995   uint16_t Flags = MI.getFlags();
2996 
2997   LLT S32 = LLT::scalar(32);
2998   LLT S1 = LLT::scalar(1);
2999 
3000   auto One = B.buildFConstant(S32, 1.0f);
3001 
3002   auto DenominatorScaled =
3003     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3004       .addUse(LHS)
3005       .addUse(RHS)
3006       .addImm(0)
3007       .setMIFlags(Flags);
3008   auto NumeratorScaled =
3009     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3010       .addUse(LHS)
3011       .addUse(RHS)
3012       .addImm(1)
3013       .setMIFlags(Flags);
3014 
3015   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3016     .addUse(DenominatorScaled.getReg(0))
3017     .setMIFlags(Flags);
3018   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3019 
3020   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3021   // aren't modeled as reading it.
3022   if (!Mode.allFP32Denormals())
3023     toggleSPDenormMode(true, B, ST, Mode);
3024 
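  // Refine the reciprocal estimate with a Newton-Raphson step (Fma0, Fma1),
  // form the scaled quotient (Mul), and apply two more fma-based corrections
  // (Fma2..Fma4) before handing the pieces to div_fmas/div_fixup.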
3025   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3026   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3027   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3028   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3029   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3030   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3031 
3032   if (!Mode.allFP32Denormals())
3033     toggleSPDenormMode(false, B, ST, Mode);
3034 
3035   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3036     .addUse(Fma4.getReg(0))
3037     .addUse(Fma1.getReg(0))
3038     .addUse(Fma3.getReg(0))
3039     .addUse(NumeratorScaled.getReg(1))
3040     .setMIFlags(Flags);
3041 
3042   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3043     .addUse(Fmas.getReg(0))
3044     .addUse(RHS)
3045     .addUse(LHS)
3046     .setMIFlags(Flags);
3047 
3048   MI.eraseFromParent();
3049   return true;
3050 }
3051 
3052 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3053                                          MachineRegisterInfo &MRI,
3054                                          MachineIRBuilder &B) const {
3055   Register Res = MI.getOperand(0).getReg();
3056   Register LHS = MI.getOperand(1).getReg();
3057   Register RHS = MI.getOperand(2).getReg();
3058 
3059   uint16_t Flags = MI.getFlags();
3060 
3061   LLT S64 = LLT::scalar(64);
3062   LLT S1 = LLT::scalar(1);
3063 
3064   auto One = B.buildFConstant(S64, 1.0);
3065 
3066   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3067     .addUse(LHS)
3068     .addUse(RHS)
3069     .addImm(0)
3070     .setMIFlags(Flags);
3071 
3072   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3073 
3074   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3075     .addUse(DivScale0.getReg(0))
3076     .setMIFlags(Flags);
3077 
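  // Refine the reciprocal of the scaled denominator with two fma-based
  // Newton-Raphson steps, then form the scaled quotient and its residual for
  // div_fmas.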
3078   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3079   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3080   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3081 
3082   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3083     .addUse(LHS)
3084     .addUse(RHS)
3085     .addImm(1)
3086     .setMIFlags(Flags);
3087 
3088   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3089   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3090   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3091 
3092   Register Scale;
3093   if (!ST.hasUsableDivScaleConditionOutput()) {
3094     // Workaround a hardware bug on SI where the condition output from div_scale
3095     // is not usable.
3096 
3097     LLT S32 = LLT::scalar(32);
3098 
3099     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3100     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3101     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3102     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3103 
3104     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3105                               Scale1Unmerge.getReg(1));
3106     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3107                               Scale0Unmerge.getReg(1));
3108     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3109   } else {
3110     Scale = DivScale1.getReg(1);
3111   }
3112 
3113   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3114     .addUse(Fma4.getReg(0))
3115     .addUse(Fma3.getReg(0))
3116     .addUse(Mul.getReg(0))
3117     .addUse(Scale)
3118     .setMIFlags(Flags);
3119 
3120   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3121     .addUse(Fmas.getReg(0))
3122     .addUse(RHS)
3123     .addUse(LHS)
3124     .setMIFlags(Flags);
3125 
3126   MI.eraseFromParent();
3127   return true;
3128 }
3129 
3130 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3131                                                  MachineRegisterInfo &MRI,
3132                                                  MachineIRBuilder &B) const {
3133   Register Res = MI.getOperand(0).getReg();
3134   Register LHS = MI.getOperand(2).getReg();
3135   Register RHS = MI.getOperand(3).getReg();
3136   uint16_t Flags = MI.getFlags();
3137 
3138   LLT S32 = LLT::scalar(32);
3139   LLT S1 = LLT::scalar(1);
3140 
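  // If |RHS| exceeds 2^96 (0x6f800000), pre-scale it by 2^-32 (0x2f800000) so
  // its reciprocal stays representable; otherwise scale by 1.0. The final
  // multiply by Sel undoes the scaling.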
3141   auto Abs = B.buildFAbs(S32, RHS, Flags);
3142   const APFloat C0Val(1.0f);
3143 
3144   auto C0 = B.buildConstant(S32, 0x6f800000);
3145   auto C1 = B.buildConstant(S32, 0x2f800000);
3146   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3147 
3148   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3149   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3150 
3151   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3152 
3153   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3154     .addUse(Mul0.getReg(0))
3155     .setMIFlags(Flags);
3156 
3157   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3158 
3159   B.buildFMul(Res, Sel, Mul1, Flags);
3160 
3161   MI.eraseFromParent();
3162   return true;
3163 }
3164 
3165 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3166                                             MachineRegisterInfo &MRI,
3167                                             MachineIRBuilder &B) const {
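  // The implicit arguments live at a fixed offset past the explicit kernel
  // arguments, so materialize them as kernarg segment pointer + offset.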
3168   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3169   uint64_t Offset =
3170     ST.getTargetLowering()->getImplicitParameterOffset(
3171       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3172   LLT DstTy = MRI.getType(DstReg);
3173   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3174 
3175   const ArgDescriptor *Arg;
3176   const TargetRegisterClass *RC;
3177   LLT ArgTy;
3178   std::tie(Arg, RC, ArgTy) =
3179       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3180   if (!Arg)
3181     return false;
3182 
3183   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3184   if (!loadInputValue(KernargPtrReg, B, Arg))
3185     return false;
3186 
3187   // FIXME: This should be nuw
3188   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3189   return true;
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3193                                                  MachineRegisterInfo &MRI,
3194                                                  MachineIRBuilder &B) const {
3195   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3196   if (!MFI->isEntryFunction()) {
3197     return legalizePreloadedArgIntrin(MI, MRI, B,
3198                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3199   }
3200 
3201   Register DstReg = MI.getOperand(0).getReg();
3202   if (!getImplicitArgPtr(DstReg, MRI, B))
3203     return false;
3204 
3205   MI.eraseFromParent();
3206   return true;
3207 }
3208 
3209 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3210                                               MachineRegisterInfo &MRI,
3211                                               MachineIRBuilder &B,
3212                                               unsigned AddrSpace) const {
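  // A flat pointer is in the queried segment if the high 32 bits of the
  // pointer match the segment's aperture base.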
3213   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3214   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3215   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3216   MI.eraseFromParent();
3217   return true;
3218 }
3219 
3220 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3221 // offset (the offset that is included in bounds checking and swizzling, to be
3222 // split between the instruction's voffset and immoffset fields) and soffset
3223 // (the offset that is excluded from bounds checking and swizzling, to go in
3224 // the instruction's soffset field).  This function takes the first kind of
3225 // offset and figures out how to split it between voffset and immoffset.
3226 std::tuple<Register, unsigned, unsigned>
3227 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3228                                         Register OrigOffset) const {
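  // The instruction's unsigned immediate offset field is 12 bits wide.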
3229   const unsigned MaxImm = 4095;
3230   Register BaseReg;
3231   unsigned TotalConstOffset;
3232   MachineInstr *OffsetDef;
3233   const LLT S32 = LLT::scalar(32);
3234 
3235   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3236     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3237 
3238   unsigned ImmOffset = TotalConstOffset;
3239 
3240   // If the immediate value is too big for the immoffset field, keep only the
3241   // low 12 bits in it, so that the value that is copied/added
3242   // for the voffset field is a multiple of 4096, and it stands more chance
3243   // of being CSEd with the copy/add for another similar load/store.
3244   // However, do not do that rounding down to a multiple of 4096 if that is a
3245   // negative number, as it appears to be illegal to have a negative offset
3246   // in the vgpr, even if adding the immediate offset makes it positive.
3247   unsigned Overflow = ImmOffset & ~MaxImm;
3248   ImmOffset -= Overflow;
3249   if ((int32_t)Overflow < 0) {
3250     Overflow += ImmOffset;
3251     ImmOffset = 0;
3252   }
3253 
3254   if (Overflow != 0) {
3255     if (!BaseReg) {
3256       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3257     } else {
3258       auto OverflowVal = B.buildConstant(S32, Overflow);
3259       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3260     }
3261   }
3262 
3263   if (!BaseReg)
3264     BaseReg = B.buildConstant(S32, 0).getReg(0);
3265 
3266   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3267 }
3268 
3269 /// Handle register layout difference for f16 images for some subtargets.
3270 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3271                                              MachineRegisterInfo &MRI,
3272                                              Register Reg) const {
3273   if (!ST.hasUnpackedD16VMem())
3274     return Reg;
3275 
3276   const LLT S16 = LLT::scalar(16);
3277   const LLT S32 = LLT::scalar(32);
3278   LLT StoreVT = MRI.getType(Reg);
3279   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3280 
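  // With the unpacked layout each 16-bit element occupies the low half of its
  // own dword, so any-extend every element to 32 bits and rebuild the vector.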
3281   auto Unmerge = B.buildUnmerge(S16, Reg);
3282 
3283   SmallVector<Register, 4> WideRegs;
3284   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3285     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3286 
3287   int NumElts = StoreVT.getNumElements();
3288 
3289   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3290 }
3291 
3292 Register AMDGPULegalizerInfo::fixStoreSourceType(
3293   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3294   MachineRegisterInfo *MRI = B.getMRI();
3295   LLT Ty = MRI->getType(VData);
3296 
3297   const LLT S16 = LLT::scalar(16);
3298 
3299   // Fixup illegal register types for i8 and i16 stores.
3300   if (Ty == LLT::scalar(8) || Ty == S16) {
3301     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3302     return AnyExt;
3303   }
3304 
3305   if (Ty.isVector()) {
3306     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3307       if (IsFormat)
3308         return handleD16VData(B, *MRI, VData);
3309     }
3310   }
3311 
3312   return VData;
3313 }
3314 
3315 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3316                                               MachineRegisterInfo &MRI,
3317                                               MachineIRBuilder &B,
3318                                               bool IsTyped,
3319                                               bool IsFormat) const {
3320   Register VData = MI.getOperand(1).getReg();
3321   LLT Ty = MRI.getType(VData);
3322   LLT EltTy = Ty.getScalarType();
3323   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3324   const LLT S32 = LLT::scalar(32);
3325 
3326   VData = fixStoreSourceType(B, VData, IsFormat);
3327   Register RSrc = MI.getOperand(2).getReg();
3328 
3329   MachineMemOperand *MMO = *MI.memoperands_begin();
3330   const int MemSize = MMO->getSize();
3331 
3332   unsigned ImmOffset;
3333   unsigned TotalOffset;
3334 
3335   // The typed intrinsics add an immediate after the registers.
3336   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3337 
3338   // The struct intrinsic variants add one additional operand over raw.
3339   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3340   Register VIndex;
3341   int OpOffset = 0;
3342   if (HasVIndex) {
3343     VIndex = MI.getOperand(3).getReg();
3344     OpOffset = 1;
3345   }
3346 
3347   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3348   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3349 
3350   unsigned Format = 0;
3351   if (IsTyped) {
3352     Format = MI.getOperand(5 + OpOffset).getImm();
3353     ++OpOffset;
3354   }
3355 
3356   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3357 
3358   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3359   if (TotalOffset != 0)
3360     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3361 
3362   unsigned Opc;
3363   if (IsTyped) {
3364     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3365                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3366   } else if (IsFormat) {
3367     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3368                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3369   } else {
3370     switch (MemSize) {
3371     case 1:
3372       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3373       break;
3374     case 2:
3375       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3376       break;
3377     default:
3378       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3379       break;
3380     }
3381   }
3382 
3383   if (!VIndex)
3384     VIndex = B.buildConstant(S32, 0).getReg(0);
3385 
3386   auto MIB = B.buildInstr(Opc)
3387     .addUse(VData)              // vdata
3388     .addUse(RSrc)               // rsrc
3389     .addUse(VIndex)             // vindex
3390     .addUse(VOffset)            // voffset
3391     .addUse(SOffset)            // soffset
3392     .addImm(ImmOffset);         // offset(imm)
3393 
3394   if (IsTyped)
3395     MIB.addImm(Format);
3396 
3397   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3398      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3399      .addMemOperand(MMO);
3400 
3401   MI.eraseFromParent();
3402   return true;
3403 }
3404 
3405 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3406                                              MachineRegisterInfo &MRI,
3407                                              MachineIRBuilder &B,
3408                                              bool IsFormat,
3409                                              bool IsTyped) const {
3410   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3411   MachineMemOperand *MMO = *MI.memoperands_begin();
3412   const int MemSize = MMO->getSize();
3413   const LLT S32 = LLT::scalar(32);
3414 
3415   Register Dst = MI.getOperand(0).getReg();
3416   Register RSrc = MI.getOperand(2).getReg();
3417 
3418   // The typed intrinsics add an immediate after the registers.
3419   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3420 
3421   // The struct intrinsic variants add one additional operand over raw.
3422   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3423   Register VIndex;
3424   int OpOffset = 0;
3425   if (HasVIndex) {
3426     VIndex = MI.getOperand(3).getReg();
3427     OpOffset = 1;
3428   }
3429 
3430   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3431   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3432 
3433   unsigned Format = 0;
3434   if (IsTyped) {
3435     Format = MI.getOperand(5 + OpOffset).getImm();
3436     ++OpOffset;
3437   }
3438 
3439   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3440   unsigned ImmOffset;
3441   unsigned TotalOffset;
3442 
3443   LLT Ty = MRI.getType(Dst);
3444   LLT EltTy = Ty.getScalarType();
3445   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3446   const bool Unpacked = ST.hasUnpackedD16VMem();
3447 
3448   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3449   if (TotalOffset != 0)
3450     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3451 
3452   unsigned Opc;
3453 
3454   if (IsTyped) {
3455     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3456                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3457   } else if (IsFormat) {
3458     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3459                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3460   } else {
3461     switch (MemSize) {
3462     case 1:
3463       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3464       break;
3465     case 2:
3466       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3467       break;
3468     default:
3469       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3470       break;
3471     }
3472   }
3473 
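  // Byte/short loads and scalar d16 loads produce a widened s32 result that is
  // truncated afterwards; packed d16 vectors on unpacked subtargets are loaded
  // as one dword per element and repacked afterwards.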
3474   Register LoadDstReg;
3475 
3476   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3477   LLT UnpackedTy = Ty.changeElementSize(32);
3478 
3479   if (IsExtLoad)
3480     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3481   else if (Unpacked && IsD16 && Ty.isVector())
3482     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3483   else
3484     LoadDstReg = Dst;
3485 
3486   if (!VIndex)
3487     VIndex = B.buildConstant(S32, 0).getReg(0);
3488 
3489   auto MIB = B.buildInstr(Opc)
3490     .addDef(LoadDstReg)         // vdata
3491     .addUse(RSrc)               // rsrc
3492     .addUse(VIndex)             // vindex
3493     .addUse(VOffset)            // voffset
3494     .addUse(SOffset)            // soffset
3495     .addImm(ImmOffset);         // offset(imm)
3496 
3497   if (IsTyped)
3498     MIB.addImm(Format);
3499 
3500   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3501      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3502      .addMemOperand(MMO);
3503 
3504   if (LoadDstReg != Dst) {
3505     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3506 
3507     // The result was widened for the extending load; truncate it back.
3508     if (IsExtLoad)
3509       B.buildTrunc(Dst, LoadDstReg);
3510     else {
3511       // Repack to original 16-bit vector result
3512       // FIXME: G_TRUNC should work, but legalization currently fails
3513       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3514       SmallVector<Register, 4> Repack;
3515       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3516         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3517       B.buildMerge(Dst, Repack);
3518     }
3519   }
3520 
3521   MI.eraseFromParent();
3522   return true;
3523 }
3524 
3525 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3526                                                MachineIRBuilder &B,
3527                                                bool IsInc) const {
3528   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3529                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3530   B.buildInstr(Opc)
3531     .addDef(MI.getOperand(0).getReg())
3532     .addUse(MI.getOperand(2).getReg())
3533     .addUse(MI.getOperand(3).getReg())
3534     .cloneMemRefs(MI);
3535   MI.eraseFromParent();
3536   return true;
3537 }
3538 
3539 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3540   switch (IntrID) {
3541   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3542   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3543     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3544   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3545   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3546     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3547   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3548   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3550   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3551   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3552     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3553   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3554   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3556   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3557   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3558     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3559   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3560   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3561     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3562   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3563   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3565   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3566   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3567     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3568   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3569   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3570     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3571   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3572   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3573     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3574   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3575   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3576     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3577   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3578   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3579     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3580   default:
3581     llvm_unreachable("unhandled atomic opcode");
3582   }
3583 }
3584 
3585 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3586                                                MachineIRBuilder &B,
3587                                                Intrinsic::ID IID) const {
3588   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3589                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3590 
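  // Intrinsic operand order: vdata, [cmp,] rsrc, [vindex,] voffset, soffset,
  // aux. The struct variants carry the extra vindex operand and cmpswap
  // carries the extra compare value.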
3591   Register Dst = MI.getOperand(0).getReg();
3592   Register VData = MI.getOperand(2).getReg();
3593 
3594   Register CmpVal;
3595   int OpOffset = 0;
3596 
3597   if (IsCmpSwap) {
3598     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3599     ++OpOffset;
3600   }
3601 
3602   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3603   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3604 
3605   // The struct intrinsic variants add one additional operand over raw.
3606   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3607   Register VIndex;
3608   if (HasVIndex) {
3609     VIndex = MI.getOperand(4 + OpOffset).getReg();
3610     ++OpOffset;
3611   }
3612 
3613   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3614   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3615   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3616 
3617   MachineMemOperand *MMO = *MI.memoperands_begin();
3618 
3619   unsigned ImmOffset;
3620   unsigned TotalOffset;
3621   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3622   if (TotalOffset != 0)
3623     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3624 
3625   if (!VIndex)
3626     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3627 
3628   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3629     .addDef(Dst)
3630     .addUse(VData); // vdata
3631 
3632   if (IsCmpSwap)
3633     MIB.addReg(CmpVal);
3634 
3635   MIB.addUse(RSrc)               // rsrc
3636      .addUse(VIndex)             // vindex
3637      .addUse(VOffset)            // voffset
3638      .addUse(SOffset)            // soffset
3639      .addImm(ImmOffset)          // offset(imm)
3640      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3641      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3642      .addMemOperand(MMO);
3643 
3644   MI.eraseFromParent();
3645   return true;
3646 }
3647 
3648 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3649 /// vectors, appending the results to \p PackedAddrs.
3650 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3651                                         SmallVectorImpl<Register> &PackedAddrs,
3652                                         int AddrIdx, int DimIdx, int EndIdx,
3653                                         int NumGradients) {
3654   const LLT S16 = LLT::scalar(16);
3655   const LLT V2S16 = LLT::vector(2, 16);
3656 
3657   for (int I = AddrIdx; I < EndIdx; ++I) {
3658     MachineOperand &SrcOp = MI.getOperand(I);
3659     if (!SrcOp.isReg())
3660       continue; // _L to _LZ may have eliminated this.
3661 
3662     Register AddrReg = SrcOp.getReg();
3663 
3664     if (I < DimIdx) {
3665       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3666       PackedAddrs.push_back(AddrReg);
3667     } else {
3668       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3669       // derivatives dx/dh and dx/dv are packed with undef.
3670       if (((I + 1) >= EndIdx) ||
3671           ((NumGradients / 2) % 2 == 1 &&
3672            (I == DimIdx + (NumGradients / 2) - 1 ||
3673             I == DimIdx + NumGradients - 1)) ||
3674           // Check for _L to _LZ optimization
3675           !MI.getOperand(I + 1).isReg()) {
3676         PackedAddrs.push_back(
3677             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3678                 .getReg(0));
3679       } else {
3680         PackedAddrs.push_back(
3681             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3682                 .getReg(0));
3683         ++I;
3684       }
3685     }
3686   }
3687 }
3688 
3689 /// Convert from separate vaddr components to a single vector address register,
3690 /// and replace the remaining operands with $noreg.
3691 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3692                                      int DimIdx, int NumVAddrs) {
3693   const LLT S32 = LLT::scalar(32);
3694 
3695   SmallVector<Register, 8> AddrRegs;
3696   for (int I = 0; I != NumVAddrs; ++I) {
3697     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3698     if (SrcOp.isReg()) {
3699       AddrRegs.push_back(SrcOp.getReg());
3700       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3701     }
3702   }
3703 
3704   int NumAddrRegs = AddrRegs.size();
3705   if (NumAddrRegs != 1) {
3706     // Round up to 8 elements for v5-v7
3707     // FIXME: Missing intermediate sized register classes and instructions.
3708     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3709       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3710       auto Undef = B.buildUndef(S32);
3711       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3712       NumAddrRegs = RoundedNumRegs;
3713     }
3714 
3715     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3716     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3717   }
3718 
3719   for (int I = 1; I != NumVAddrs; ++I) {
3720     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3721     if (SrcOp.isReg())
3722       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3723   }
3724 }
3725 
3726 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3727 ///
3728 /// Depending on the subtarget, load/store with 16-bit element data need to be
3729 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3730 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3731 /// registers.
3732 ///
3733 /// We don't want to directly select image instructions just yet, but also want
3734 /// to expose all register repacking to the legalizer/combiners. We also don't
3735 /// want a selected instruction entering RegBankSelect. In order to avoid
3736 /// defining a multitude of intermediate image instructions, directly hack on
3737 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3738 /// now unnecessary arguments with $noreg.
3739 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3740     MachineInstr &MI, MachineIRBuilder &B,
3741     GISelChangeObserver &Observer,
3742     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3743 
3744   const int NumDefs = MI.getNumExplicitDefs();
3745   bool IsTFE = NumDefs == 2;
3746   // We are only processing the operands of d16 image operations on subtargets
3747   // that use the unpacked register layout, or need to repack the TFE result.
3748 
3749   // TODO: Do we need to guard against already legalized intrinsics?
3750   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3751     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3752 
3753   MachineRegisterInfo *MRI = B.getMRI();
3754   const LLT S32 = LLT::scalar(32);
3755   const LLT S16 = LLT::scalar(16);
3756   const LLT V2S16 = LLT::vector(2, 16);
3757 
3758   // Index of first address argument
3759   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3760 
3761   int NumVAddrs, NumGradients;
3762   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3763   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3764     getDMaskIdx(BaseOpcode, NumDefs);
3765   unsigned DMask = 0;
3766 
3767   // Check for 16 bit addresses and pack if true.
3768   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3769   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3770   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3771   const bool IsG16 = GradTy == S16;
3772   const bool IsA16 = AddrTy == S16;
3773 
3774   int DMaskLanes = 0;
3775   if (!BaseOpcode->Atomic) {
3776     DMask = MI.getOperand(DMaskIdx).getImm();
3777     if (BaseOpcode->Gather4) {
3778       DMaskLanes = 4;
3779     } else if (DMask != 0) {
3780       DMaskLanes = countPopulation(DMask);
3781     } else if (!IsTFE && !BaseOpcode->Store) {
3782       // If dmask is 0, this is a no-op load. This can be eliminated.
3783       B.buildUndef(MI.getOperand(0));
3784       MI.eraseFromParent();
3785       return true;
3786     }
3787   }
3788 
3789   Observer.changingInstr(MI);
3790   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3791 
3792   unsigned NewOpcode = NumDefs == 0 ?
3793     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3794 
3795   // Track that we legalized this
3796   MI.setDesc(B.getTII().get(NewOpcode));
3797 
3798   // Expecting to get an error flag since TFE is on and dmask is 0. Force
3799   // dmask to be at least 1, otherwise the instruction will fail.
3800   if (IsTFE && DMask == 0) {
3801     DMask = 0x1;
3802     DMaskLanes = 1;
3803     MI.getOperand(DMaskIdx).setImm(DMask);
3804   }
3805 
3806   if (BaseOpcode->Atomic) {
3807     Register VData0 = MI.getOperand(2).getReg();
3808     LLT Ty = MRI->getType(VData0);
3809 
3810     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3811     if (Ty.isVector())
3812       return false;
3813 
3814     if (BaseOpcode->AtomicX2) {
3815       Register VData1 = MI.getOperand(3).getReg();
3816       // The two values are packed in one register.
3817       LLT PackedTy = LLT::vector(2, Ty);
3818       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3819       MI.getOperand(2).setReg(Concat.getReg(0));
3820       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3821     }
3822   }
3823 
3824   int CorrectedNumVAddrs = NumVAddrs;
3825 
3826   // Optimize _L to _LZ when _L is zero
3827   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3828         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3829     const ConstantFP *ConstantLod;
3830     const int LodIdx = AddrIdx + NumVAddrs - 1;
3831 
3832     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3833       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3834         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3835         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3836           LZMappingInfo->LZ, ImageDimIntr->Dim);
3837 
3838         // The starting indexes should remain in the same place.
3839         --NumVAddrs;
3840         --CorrectedNumVAddrs;
3841 
3842         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3843           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3844         MI.RemoveOperand(LodIdx);
3845       }
3846     }
3847   }
3848 
3849   // Optimize _mip away, when 'lod' is zero
3850   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3851     int64_t ConstantLod;
3852     const int LodIdx = AddrIdx + NumVAddrs - 1;
3853 
3854     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3855       if (ConstantLod == 0) {
3856         // TODO: Change intrinsic opcode and remove operand instead of replacing
3857         // it with 0, as the _L to _LZ handling is done above.
3858         MI.getOperand(LodIdx).ChangeToImmediate(0);
3859         --CorrectedNumVAddrs;
3860       }
3861     }
3862   }
3863 
3864   // Rewrite the addressing register layout before doing anything else.
3865   if (IsA16 || IsG16) {
3866     if (IsA16) {
3867       // Target must support the feature and gradients need to be 16 bit too
3868       if (!ST.hasA16() || !IsG16)
3869         return false;
3870     } else if (!ST.hasG16())
3871       return false;
3872 
3873     if (NumVAddrs > 1) {
3874       SmallVector<Register, 4> PackedRegs;
3875       // Don't compress addresses for G16
3876       const int PackEndIdx =
3877           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3878       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3879                                   PackEndIdx, NumGradients);
3880 
3881       if (!IsA16) {
3882         // Add uncompressed address
3883         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3884           int AddrReg = MI.getOperand(I).getReg();
3885           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3886           PackedRegs.push_back(AddrReg);
3887         }
3888       }
3889 
3890       // See also below in the non-a16 branch
3891       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3892 
3893       if (!UseNSA && PackedRegs.size() > 1) {
3894         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3895         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3896         PackedRegs[0] = Concat.getReg(0);
3897         PackedRegs.resize(1);
3898       }
3899 
3900       const int NumPacked = PackedRegs.size();
3901       for (int I = 0; I != NumVAddrs; ++I) {
3902         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3903         if (!SrcOp.isReg()) {
3904           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3905           continue;
3906         }
3907 
3908         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3909 
3910         if (I < NumPacked)
3911           SrcOp.setReg(PackedRegs[I]);
3912         else
3913           SrcOp.setReg(AMDGPU::NoRegister);
3914       }
3915     }
3916   } else {
3917     // If the register allocator cannot place the address registers contiguously
3918     // without introducing moves, then using the non-sequential address encoding
3919     // is always preferable, since it saves VALU instructions and is usually a
3920     // wash in terms of code size or even better.
3921     //
3922     // However, we currently have no way of hinting to the register allocator
3923     // that MIMG addresses should be placed contiguously when it is possible to
3924     // do so, so force non-NSA for the common 2-address case as a heuristic.
3925     //
3926     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3927     // allocation when possible.
3928     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3929 
3930     if (!UseNSA && NumVAddrs > 1)
3931       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3932   }
3933 
3934   int Flags = 0;
3935   if (IsA16)
3936     Flags |= 1;
3937   if (IsG16)
3938     Flags |= 2;
3939   MI.addOperand(MachineOperand::CreateImm(Flags));
3940 
3941   if (BaseOpcode->Store) { // No TFE for stores?
3942     // TODO: Handle dmask trim
3943     Register VData = MI.getOperand(1).getReg();
3944     LLT Ty = MRI->getType(VData);
3945     if (!Ty.isVector() || Ty.getElementType() != S16)
3946       return true;
3947 
3948     Register RepackedReg = handleD16VData(B, *MRI, VData);
3949     if (RepackedReg != VData) {
3950       MI.getOperand(1).setReg(RepackedReg);
3951     }
3952 
3953     return true;
3954   }
3955 
3956   Register DstReg = MI.getOperand(0).getReg();
3957   LLT Ty = MRI->getType(DstReg);
3958   const LLT EltTy = Ty.getScalarType();
3959   const bool IsD16 = Ty.getScalarType() == S16;
3960   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3961 
3962   // Confirm that the return type is large enough for the dmask specified
3963   if (NumElts < DMaskLanes)
3964     return false;
3965 
3966   if (NumElts > 4 || DMaskLanes > 4)
3967     return false;
3968 
3969   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3970   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3971 
3972   // The raw dword aligned data component of the load. The only legal cases
3973   // where this matters should be when using the packed D16 format, for
3974   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3975   LLT RoundedTy;
3976 
3977   // S32 vector to cover all data, plus the TFE result element.
3978   LLT TFETy;
3979 
3980   // Register type to use for each loaded component. Will be S32 or V2S16.
3981   LLT RegTy;
3982 
3983   if (IsD16 && ST.hasUnpackedD16VMem()) {
3984     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3985     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3986     RegTy = S32;
3987   } else {
3988     unsigned EltSize = EltTy.getSizeInBits();
3989     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3990     unsigned RoundedSize = 32 * RoundedElts;
3991     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3992     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3993     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3994   }
3995 
3996   // The return type does not need adjustment.
3997   // TODO: Should we change s16 case to s32 or <2 x s16>?
3998   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3999     return true;
4000 
4001   Register Dst1Reg;
4002 
4003   // Insert after the instruction.
4004   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4005 
4006   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4007   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4008   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4009   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4010 
4011   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4012 
4013   MI.getOperand(0).setReg(NewResultReg);
4014 
4015   // In the IR, TFE is supposed to be used with a 2 element struct return
4016   // type. The instruction really returns these two values in one contiguous
4017   // register, with one additional dword beyond the loaded data. Rewrite the
4018   // return type to use a single register result.
4019 
4020   if (IsTFE) {
4021     Dst1Reg = MI.getOperand(1).getReg();
4022     if (MRI->getType(Dst1Reg) != S32)
4023       return false;
4024 
4025     // TODO: Make sure the TFE operand bit is set.
4026     MI.RemoveOperand(1);
4027 
4028     // Handle the easy case that requires no repack instructions.
4029     if (Ty == S32) {
4030       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4031       return true;
4032     }
4033   }
4034 
4035   // Now figure out how to copy the new result register back into the old
4036   // result.
4037   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4038 
4039   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4040 
4041   if (ResultNumRegs == 1) {
4042     assert(!IsTFE);
4043     ResultRegs[0] = NewResultReg;
4044   } else {
4045     // We have to repack into a new vector of some kind.
4046     for (int I = 0; I != NumDataRegs; ++I)
4047       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4048     B.buildUnmerge(ResultRegs, NewResultReg);
4049 
4050     // Drop the final TFE element to get the data part. The TFE result is
4051     // directly written to the right place already.
4052     if (IsTFE)
4053       ResultRegs.resize(NumDataRegs);
4054   }
4055 
4056   // For an s16 scalar result, we form an s32 result with a truncate regardless
4057   // of packed vs. unpacked.
4058   if (IsD16 && !Ty.isVector()) {
4059     B.buildTrunc(DstReg, ResultRegs[0]);
4060     return true;
4061   }
4062 
4063   // Avoid a build/concat_vector of 1 entry.
4064   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4065     B.buildBitcast(DstReg, ResultRegs[0]);
4066     return true;
4067   }
4068 
4069   assert(Ty.isVector());
4070 
4071   if (IsD16) {
4072     // For packed D16 results with TFE enabled, all the data components are
4073     // S32. Cast back to the expected type.
4074     //
4075     // TODO: We don't really need to use s32 elements for the load. We would
4076     // only need one cast for the TFE result if a multiple of v2s16 was used.
4077     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4078       for (Register &Reg : ResultRegs)
4079         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4080     } else if (ST.hasUnpackedD16VMem()) {
4081       for (Register &Reg : ResultRegs)
4082         Reg = B.buildTrunc(S16, Reg).getReg(0);
4083     }
4084   }
4085 
4086   auto padWithUndef = [&](LLT Ty, int NumElts) {
4087     if (NumElts == 0)
4088       return;
4089     Register Undef = B.buildUndef(Ty).getReg(0);
4090     for (int I = 0; I != NumElts; ++I)
4091       ResultRegs.push_back(Undef);
4092   };
4093 
4094   // Pad out any elements eliminated due to the dmask.
4095   LLT ResTy = MRI->getType(ResultRegs[0]);
4096   if (!ResTy.isVector()) {
4097     padWithUndef(ResTy, NumElts - ResultRegs.size());
4098     B.buildBuildVector(DstReg, ResultRegs);
4099     return true;
4100   }
4101 
4102   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4103   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4104 
4105   // Deal with the one annoying legal case.
4106   const LLT V3S16 = LLT::vector(3, 16);
4107   if (Ty == V3S16) {
4108     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4109     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4110     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4111     return true;
4112   }
4113 
4114   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4115   B.buildConcatVectors(DstReg, ResultRegs);
4116   return true;
4117 }
4118 
4119 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4120   MachineInstr &MI, MachineIRBuilder &B,
4121   GISelChangeObserver &Observer) const {
4122   Register Dst = MI.getOperand(0).getReg();
4123   LLT Ty = B.getMRI()->getType(Dst);
4124   unsigned Size = Ty.getSizeInBits();
4125   MachineFunction &MF = B.getMF();
4126 
4127   Observer.changingInstr(MI);
4128 
4129   // FIXME: We don't really need this intermediate instruction. The intrinsic
4130   // should be fixed to have a memory operand. Since it's readnone, we're not
4131   // allowed to add one.
4132   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4133   MI.RemoveOperand(1); // Remove intrinsic ID
4134 
4135   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4136   // TODO: Should this use datalayout alignment?
4137   const unsigned MemSize = (Size + 7) / 8;
4138   const Align MemAlign(4);
4139   MachineMemOperand *MMO = MF.getMachineMemOperand(
4140       MachinePointerInfo(),
4141       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4142           MachineMemOperand::MOInvariant,
4143       MemSize, MemAlign);
4144   MI.addMemOperand(MF, MMO);
4145 
4146   // There are no 96-bit result scalar loads, but widening to 128-bit should
4147   // always be legal. We may need to restore this to a 96-bit result if it turns
4148   // out this needs to be converted to a vector load during RegBankSelect.
4149   if (!isPowerOf2_32(Size)) {
4150     LegalizerHelper Helper(MF, *this, Observer, B);
4151 
4152     if (Ty.isVector())
4153       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4154     else
4155       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4156   }
4157 
4158   Observer.changedInstr(MI);
4159   return true;
4160 }
4161 
4162 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4163                                                 MachineRegisterInfo &MRI,
4164                                                 MachineIRBuilder &B) const {
4165   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4166   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4167       !ST.isTrapHandlerEnabled()) {
4168     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4169   } else {
4170     // Pass queue pointer to trap handler as input, and insert trap instruction
4171     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4172     const ArgDescriptor *Arg =
4173         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4174     if (!Arg)
4175       return false;
4176     MachineRegisterInfo &MRI = *B.getMRI();
4177     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4178     Register LiveIn = getLiveInRegister(
4179         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4180         /*InsertLiveInCopy=*/false);
4181     if (!loadInputValue(LiveIn, B, Arg))
4182       return false;
4183     B.buildCopy(SGPR01, LiveIn);
4184     B.buildInstr(AMDGPU::S_TRAP)
4185         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4186         .addReg(SGPR01, RegState::Implicit);
4187   }
4188 
4189   MI.eraseFromParent();
4190   return true;
4191 }
4192 
4193 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4194     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4195   // If this is a non-HSA path or the trap handler is disabled, report a
4196   // warning accordingly.
4197   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4198       !ST.isTrapHandlerEnabled()) {
4199     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4200                                      "debugtrap handler not supported",
4201                                      MI.getDebugLoc(), DS_Warning);
4202     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4203     Ctx.diagnose(NoTrap);
4204   } else {
4205     // Insert debug-trap instruction
4206     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4207   }
4208 
4209   MI.eraseFromParent();
4210   return true;
4211 }
4212 
4213 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4214                                             MachineInstr &MI) const {
4215   MachineIRBuilder &B = Helper.MIRBuilder;
4216   MachineRegisterInfo &MRI = *B.getMRI();
4217 
4218   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4219   auto IntrID = MI.getIntrinsicID();
4220   switch (IntrID) {
4221   case Intrinsic::amdgcn_if:
4222   case Intrinsic::amdgcn_else: {
4223     MachineInstr *Br = nullptr;
4224     MachineBasicBlock *UncondBrTarget = nullptr;
4225     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4226       const SIRegisterInfo *TRI
4227         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4228 
4229       Register Def = MI.getOperand(1).getReg();
4230       Register Use = MI.getOperand(3).getReg();
4231 
4232       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4233       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4234       if (IntrID == Intrinsic::amdgcn_if) {
4235         B.buildInstr(AMDGPU::SI_IF)
4236           .addDef(Def)
4237           .addUse(Use)
4238           .addMBB(UncondBrTarget);
4239       } else {
4240         B.buildInstr(AMDGPU::SI_ELSE)
4241           .addDef(Def)
4242           .addUse(Use)
4243           .addMBB(UncondBrTarget)
4244           .addImm(0);
4245       }
4246 
4247       if (Br) {
4248         Br->getOperand(0).setMBB(CondBrTarget);
4249       } else {
4250         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4251         // since we're swapping branch targets it needs to be reinserted.
4252         // FIXME: IRTranslator should probably not do this
4253         B.buildBr(*CondBrTarget);
4254       }
4255 
4256       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4257       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4258       MI.eraseFromParent();
4259       BrCond->eraseFromParent();
4260       return true;
4261     }
4262 
4263     return false;
4264   }
4265   case Intrinsic::amdgcn_loop: {
4266     MachineInstr *Br = nullptr;
4267     MachineBasicBlock *UncondBrTarget = nullptr;
4268     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4269       const SIRegisterInfo *TRI
4270         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4271 
4272       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4273       Register Reg = MI.getOperand(2).getReg();
4274 
4275       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4276       B.buildInstr(AMDGPU::SI_LOOP)
4277         .addUse(Reg)
4278         .addMBB(UncondBrTarget);
4279 
4280       if (Br)
4281         Br->getOperand(0).setMBB(CondBrTarget);
4282       else
4283         B.buildBr(*CondBrTarget);
4284 
4285       MI.eraseFromParent();
4286       BrCond->eraseFromParent();
4287       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4288       return true;
4289     }
4290 
4291     return false;
4292   }
4293   case Intrinsic::amdgcn_kernarg_segment_ptr:
4294     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4295       // This only makes sense to call in a kernel, so just lower to null.
4296       B.buildConstant(MI.getOperand(0).getReg(), 0);
4297       MI.eraseFromParent();
4298       return true;
4299     }
4300 
4301     return legalizePreloadedArgIntrin(
4302       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4303   case Intrinsic::amdgcn_implicitarg_ptr:
4304     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
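  // The raw and struct buffer load/store variants share one legalization
  // path; the boolean flags distinguish the plain, format and typed
  // (tbuffer) forms.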
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
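  // Every buffer atomic is legalized the same way; the intrinsic ID selects
  // the resulting atomic opcode.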
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
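    // Image dimension intrinsics are identified via getImageDimIntrinsicInfo;
    // anything else needs no custom lowering here.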
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}