1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
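// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits.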
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
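// Match vectors of 16-bit elements with more than two elements.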
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
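// Mutation that adds one more element to the vector type at TypeIdx.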
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
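// Mutation that shrinks the vector type at TypeIdx so the result is at most
// 64 bits wide; wider vectors get broken into such pieces.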
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements until the total size reaches the
// next multiple of 32 bits.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
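// Mutation that bitcasts the type at TypeIdx to an equivalently sized scalar
// (for 32 bits or less) or to a vector of 32-bit elements.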
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
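// Match vector types whose total size is less than Size bits.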
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
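// Match vector types whose total size is greater than Size bits.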
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
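// Match vector types with an odd number of elements.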
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
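// True for sizes that are a multiple of 32 bits, up to the maximum register
// size.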
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
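// Element types that pack cleanly into 32-bit registers: 16-bit elements or
// any multiple of 32 bits.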
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
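// Vector types that map onto register tuples: 32/64/128/256-bit elements, or
// an even number of 16-bit elements.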
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
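// Return true if the combination of result type, memory size, alignment and
// address space is one the selector can handle without splitting the access.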
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, Align);
#endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
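// Combined load/store legality check: the result must be a register type, the
// memory access must be selectable, and the type must not need the bitcast
// workaround above.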
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
448     // Technically the saturating operations require clamp bit support, but this
449     // was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
457     // We're just lowering this, but it helps get a better result to try to
458     // coerce to the desired type first.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .customIf(typeIsNot(0, PrivatePtr));
555 
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
675     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
676     // situations (like an invalid implicit use), we don't want to infinite loop
677     // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   // Lower roundeven into G_FRINT
715   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
716     .scalarize(0)
717     .lower();
718 
719   if (ST.has16BitInsts()) {
720     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
721       .legalFor({S16, S32, S64})
722       .clampScalar(0, S16, S64)
723       .scalarize(0);
724   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
725     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
726       .legalFor({S32, S64})
727       .clampScalar(0, S32, S64)
728       .scalarize(0);
729   } else {
730     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
731       .legalFor({S32})
732       .customFor({S64})
733       .clampScalar(0, S32, S64)
734       .scalarize(0);
735   }
736 
737   getActionDefinitionsBuilder(G_PTR_ADD)
738     .legalIf(all(isPointer(0), sameSize(0, 1)))
739     .scalarize(0)
740     .scalarSameSizeAs(1, 0);
741 
742   getActionDefinitionsBuilder(G_PTRMASK)
743     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
744     .scalarSameSizeAs(1, 0)
745     .scalarize(0);
746 
747   auto &CmpBuilder =
748     getActionDefinitionsBuilder(G_ICMP)
749     // The compare output type differs based on the register bank of the output,
750     // so make both s1 and s32 legal.
751     //
752     // Scalar compares producing output in scc will be promoted to s32, as that
753     // is the allocatable register type that will be needed for the copy from
754     // scc. This will be promoted during RegBankSelect, and we assume something
755     // before that won't try to use s32 result types.
756     //
757     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
758     // bank.
759     .legalForCartesianProduct(
760       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
761     .legalForCartesianProduct(
762       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
763   if (ST.has16BitInsts()) {
764     CmpBuilder.legalFor({{S1, S16}});
765   }
766 
767   CmpBuilder
768     .widenScalarToNextPow2(1)
769     .clampScalar(1, S32, S64)
770     .scalarize(0)
771     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
772 
773   getActionDefinitionsBuilder(G_FCMP)
774     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
775     .widenScalarToNextPow2(1)
776     .clampScalar(1, S32, S64)
777     .scalarize(0);
778 
779   // FIXME: fpow has a selection pattern that should move to custom lowering.
780   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
781   if (ST.has16BitInsts())
782     Exp2Ops.legalFor({S32, S16});
783   else
784     Exp2Ops.legalFor({S32});
785   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
786   Exp2Ops.scalarize(0);
787 
788   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
789   if (ST.has16BitInsts())
790     ExpOps.customFor({{S32}, {S16}});
791   else
792     ExpOps.customFor({S32});
793   ExpOps.clampScalar(0, MinScalarFPTy, S32)
794         .scalarize(0);
795 
796   getActionDefinitionsBuilder(G_FPOWI)
797     .clampScalar(0, MinScalarFPTy, S32)
798     .lower();
799 
800   // The 64-bit versions produce 32-bit results, but only on the SALU.
801   getActionDefinitionsBuilder(G_CTPOP)
802     .legalFor({{S32, S32}, {S32, S64}})
803     .clampScalar(0, S32, S32)
804     .clampScalar(1, S32, S64)
805     .scalarize(0)
806     .widenScalarToNextPow2(0, 32)
807     .widenScalarToNextPow2(1, 32);
808 
809   // The hardware instructions return a different result on 0 than the generic
810   // instructions expect. The hardware produces -1, but these produce the
811   // bitwidth.
812   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
813     .scalarize(0)
814     .clampScalar(0, S32, S32)
815     .clampScalar(1, S32, S64)
816     .widenScalarToNextPow2(0, 32)
817     .widenScalarToNextPow2(1, 32)
818     .lower();
819 
820   // The 64-bit versions produce 32-bit results, but only on the SALU.
821   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
822     .legalFor({{S32, S32}, {S32, S64}})
823     .clampScalar(0, S32, S32)
824     .clampScalar(1, S32, S64)
825     .scalarize(0)
826     .widenScalarToNextPow2(0, 32)
827     .widenScalarToNextPow2(1, 32);
828 
829   getActionDefinitionsBuilder(G_BITREVERSE)
830     .legalFor({S32})
831     .clampScalar(0, S32, S32)
832     .scalarize(0);
833 
834   if (ST.has16BitInsts()) {
835     getActionDefinitionsBuilder(G_BSWAP)
836       .legalFor({S16, S32, V2S16})
837       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
840       .widenScalarToNextPow2(0)
841       .clampScalar(0, S16, S32)
842       .scalarize(0);
843 
844     if (ST.hasVOP3PInsts()) {
845       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
846         .legalFor({S32, S16, V2S16})
847         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
848         .clampMaxNumElements(0, S16, 2)
849         .minScalar(0, S16)
850         .widenScalarToNextPow2(0)
851         .scalarize(0)
852         .lower();
853     } else {
854       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
855         .legalFor({S32, S16})
856         .widenScalarToNextPow2(0)
857         .minScalar(0, S16)
858         .scalarize(0)
859         .lower();
860     }
861   } else {
862     // TODO: Should have same legality without v_perm_b32
863     getActionDefinitionsBuilder(G_BSWAP)
864       .legalFor({S32})
865       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
868       .widenScalarToNextPow2(0)
869       .maxScalar(0, S32)
870       .scalarize(0)
871       .lower();
872 
873     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
874       .legalFor({S32})
875       .minScalar(0, S32)
876       .widenScalarToNextPow2(0)
877       .scalarize(0)
878       .lower();
879   }
880 
881   getActionDefinitionsBuilder(G_INTTOPTR)
882     // List the common cases
883     .legalForCartesianProduct(AddrSpaces64, {S64})
884     .legalForCartesianProduct(AddrSpaces32, {S32})
885     .scalarize(0)
886     // Accept any address space as long as the size matches
887     .legalIf(sameSize(0, 1))
888     .widenScalarIf(smallerThan(1, 0),
889       [](const LegalityQuery &Query) {
890         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
891       })
892     .narrowScalarIf(largerThan(1, 0),
893       [](const LegalityQuery &Query) {
894         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
895       });
896 
897   getActionDefinitionsBuilder(G_PTRTOINT)
898     // List the common cases
899     .legalForCartesianProduct(AddrSpaces64, {S64})
900     .legalForCartesianProduct(AddrSpaces32, {S32})
901     .scalarize(0)
902     // Accept any address space as long as the size matches
903     .legalIf(sameSize(0, 1))
904     .widenScalarIf(smallerThan(0, 1),
905       [](const LegalityQuery &Query) {
906         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
907       })
908     .narrowScalarIf(
909       largerThan(0, 1),
910       [](const LegalityQuery &Query) {
911         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
912       });
913 
914   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
915     .scalarize(0)
916     .custom();
917 
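  // Whether a load/store must be broken down because it is too wide, oddly
  // sized, or under-aligned for the target.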
918   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
919                                     bool IsLoad) -> bool {
920     const LLT DstTy = Query.Types[0];
921 
922     // Split vector extloads.
923     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
924     unsigned Align = Query.MMODescrs[0].AlignInBits;
925 
926     if (MemSize < DstTy.getSizeInBits())
927       MemSize = std::max(MemSize, Align);
928 
929     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
930       return true;
931 
932     const LLT PtrTy = Query.Types[1];
933     unsigned AS = PtrTy.getAddressSpace();
934     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
935       return true;
936 
937     // Catch weird sized loads that don't evenly divide into the access sizes
938     // TODO: May be able to widen depending on alignment etc.
939     unsigned NumRegs = (MemSize + 31) / 32;
940     if (NumRegs == 3) {
941       if (!ST.hasDwordx3LoadStores())
942         return true;
943     } else {
944       // If the alignment allows, these should have been widened.
945       if (!isPowerOf2_32(NumRegs))
946         return true;
947     }
948 
949     if (Align < MemSize) {
950       const SITargetLowering *TLI = ST.getTargetLowering();
951       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
952     }
953 
954     return false;
955   };
956 
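  // Whether a non-power-of-2 sized load should instead be widened to the next
  // power of 2, which is possible when the access is sufficiently aligned.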
957   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
958                                          unsigned Opc) -> bool {
959     unsigned Size = Query.Types[0].getSizeInBits();
960     if (isPowerOf2_32(Size))
961       return false;
962 
963     if (Size == 96 && ST.hasDwordx3LoadStores())
964       return false;
965 
966     unsigned AddrSpace = Query.Types[1].getAddressSpace();
967     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
968       return false;
969 
970     unsigned Align = Query.MMODescrs[0].AlignInBits;
971     unsigned RoundedSize = NextPowerOf2(Size);
972     return (Align >= RoundedSize);
973   };
974 
975   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
976   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
977   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
978 
979   // TODO: Refine based on subtargets which support unaligned access or 128-bit
980   // LDS
981   // TODO: Unsupported flat for SI.
982 
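  // Build the G_LOAD/G_STORE rules: list the common legal cases explicitly,
  // then bitcast, widen, narrow, or split everything else into legal pieces.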
983   for (unsigned Op : {G_LOAD, G_STORE}) {
984     const bool IsStore = Op == G_STORE;
985 
986     auto &Actions = getActionDefinitionsBuilder(Op);
987     // Explicitly list some common cases.
988     // TODO: Does this help compile time at all?
989     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
990                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
991                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
992                                       {S64, GlobalPtr, 64, GlobalAlign32},
993                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
994                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
995                                       {S32, GlobalPtr, 8, GlobalAlign8},
996                                       {S32, GlobalPtr, 16, GlobalAlign16},
997 
998                                       {S32, LocalPtr, 32, 32},
999                                       {S64, LocalPtr, 64, 32},
1000                                       {V2S32, LocalPtr, 64, 32},
1001                                       {S32, LocalPtr, 8, 8},
1002                                       {S32, LocalPtr, 16, 16},
1003                                       {V2S16, LocalPtr, 32, 32},
1004 
1005                                       {S32, PrivatePtr, 32, 32},
1006                                       {S32, PrivatePtr, 8, 8},
1007                                       {S32, PrivatePtr, 16, 16},
1008                                       {V2S16, PrivatePtr, 32, 32},
1009 
1010                                       {S32, ConstantPtr, 32, GlobalAlign32},
1011                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1012                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1013                                       {S64, ConstantPtr, 64, GlobalAlign32},
1014                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1015     Actions.legalIf(
1016       [=](const LegalityQuery &Query) -> bool {
1017         return isLoadStoreLegal(ST, Query, Op);
1018       });
1019 
    // The 32-bit constant address space is handled by addrspacecasting the
    // 32-bit pointer to 64 bits.
1022     //
1023     // TODO: Should generalize bitcast action into coerce, which will also cover
1024     // inserting addrspacecasts.
1025     Actions.customIf(typeIs(1, Constant32Ptr));
1026 
    // Turn vectors with illegal element types into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
1030     //
1031     // For odd 16-bit element vectors, prefer to split those into pieces with
1032     // 16-bit vector parts.
1033     Actions.bitcastIf(
1034       [=](const LegalityQuery &Query) -> bool {
1035         const LLT Ty = Query.Types[0];
1036         const unsigned Size = Ty.getSizeInBits();
1037 
1038         if (Size != Query.MMODescrs[0].SizeInBits)
1039           return Size <= 32 && Ty.isVector();
1040 
1041         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1042           return true;
1043         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1044                !isRegisterVectorElementType(Ty.getElementType());
1045       }, bitcastToRegisterType(0));
1046 
1047     Actions
1048         .customIf(typeIs(1, Constant32Ptr))
1049         // Widen suitably aligned loads by loading extra elements.
1050         .moreElementsIf([=](const LegalityQuery &Query) {
1051             const LLT Ty = Query.Types[0];
1052             return Op == G_LOAD && Ty.isVector() &&
1053                    shouldWidenLoadResult(Query, Op);
1054           }, moreElementsToNextPow2(0))
1055         .widenScalarIf([=](const LegalityQuery &Query) {
1056             const LLT Ty = Query.Types[0];
1057             return Op == G_LOAD && !Ty.isVector() &&
1058                    shouldWidenLoadResult(Query, Op);
1059           }, widenScalarOrEltToNextPow2(0))
1060         .narrowScalarIf(
1061             [=](const LegalityQuery &Query) -> bool {
1062               return !Query.Types[0].isVector() &&
1063                      needToSplitMemOp(Query, Op == G_LOAD);
1064             },
1065             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1066               const LLT DstTy = Query.Types[0];
1067               const LLT PtrTy = Query.Types[1];
1068 
1069               const unsigned DstSize = DstTy.getSizeInBits();
1070               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1071 
1072               // Split extloads.
1073               if (DstSize > MemSize)
1074                 return std::make_pair(0, LLT::scalar(MemSize));
1075 
1076               if (!isPowerOf2_32(DstSize)) {
1077                 // We're probably decomposing an odd sized store. Try to split
1078                 // to the widest type. TODO: Account for alignment. As-is it
1079                 // should be OK, since the new parts will be further legalized.
1080                 unsigned FloorSize = PowerOf2Floor(DstSize);
1081                 return std::make_pair(0, LLT::scalar(FloorSize));
1082               }
1083 
1084               if (DstSize > 32 && (DstSize % 32 != 0)) {
1085                 // FIXME: Need a way to specify non-extload of larger size if
1086                 // suitably aligned.
1087                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1088               }
1089 
1090               unsigned MaxSize = maxSizeForAddrSpace(ST,
1091                                                      PtrTy.getAddressSpace(),
1092                                                      Op == G_LOAD);
1093               if (MemSize > MaxSize)
1094                 return std::make_pair(0, LLT::scalar(MaxSize));
1095 
1096               unsigned Align = Query.MMODescrs[0].AlignInBits;
1097               return std::make_pair(0, LLT::scalar(Align));
1098             })
1099         .fewerElementsIf(
1100             [=](const LegalityQuery &Query) -> bool {
1101               return Query.Types[0].isVector() &&
1102                      needToSplitMemOp(Query, Op == G_LOAD);
1103             },
1104             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1105               const LLT DstTy = Query.Types[0];
1106               const LLT PtrTy = Query.Types[1];
1107 
1108               LLT EltTy = DstTy.getElementType();
1109               unsigned MaxSize = maxSizeForAddrSpace(ST,
1110                                                      PtrTy.getAddressSpace(),
1111                                                      Op == G_LOAD);
1112 
1113               // FIXME: Handle widened to power of 2 results better. This ends
1114               // up scalarizing.
1115               // FIXME: 3 element stores scalarized on SI
1116 
1117               // Split if it's too large for the address space.
1118               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1119                 unsigned NumElts = DstTy.getNumElements();
1120                 unsigned EltSize = EltTy.getSizeInBits();
1121 
1122                 if (MaxSize % EltSize == 0) {
1123                   return std::make_pair(
1124                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1125                 }
1126 
1127                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1128 
1129                 // FIXME: Refine when odd breakdowns handled
1130                 // The scalars will need to be re-legalized.
1131                 if (NumPieces == 1 || NumPieces >= NumElts ||
1132                     NumElts % NumPieces != 0)
1133                   return std::make_pair(0, EltTy);
1134 
1135                 return std::make_pair(0,
1136                                       LLT::vector(NumElts / NumPieces, EltTy));
1137               }
1138 
1139               // FIXME: We could probably handle weird extending loads better.
1140               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1141               if (DstTy.getSizeInBits() > MemSize)
1142                 return std::make_pair(0, EltTy);
1143 
1144               unsigned EltSize = EltTy.getSizeInBits();
1145               unsigned DstSize = DstTy.getSizeInBits();
1146               if (!isPowerOf2_32(DstSize)) {
1147                 // We're probably decomposing an odd sized store. Try to split
1148                 // to the widest type. TODO: Account for alignment. As-is it
1149                 // should be OK, since the new parts will be further legalized.
1150                 unsigned FloorSize = PowerOf2Floor(DstSize);
1151                 return std::make_pair(
1152                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1153               }
1154 
1155               // Need to split because of alignment.
1156               unsigned Align = Query.MMODescrs[0].AlignInBits;
1157               if (EltSize > Align &&
1158                   (EltSize / Align < DstTy.getNumElements())) {
1159                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1160               }
1161 
1162               // May need relegalization for the scalars.
1163               return std::make_pair(0, EltTy);
1164             })
1165         .minScalar(0, S32);
1166 
1167     if (IsStore)
1168       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1169 
1170     // TODO: Need a bitcast lower option?
1171     Actions
1172         .widenScalarToNextPow2(0)
1173         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1174   }
1175 
1176   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1177                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1178                                                   {S32, GlobalPtr, 16, 2 * 8},
1179                                                   {S32, LocalPtr, 8, 8},
1180                                                   {S32, LocalPtr, 16, 16},
1181                                                   {S32, PrivatePtr, 8, 8},
1182                                                   {S32, PrivatePtr, 16, 16},
1183                                                   {S32, ConstantPtr, 8, 8},
1184                                                   {S32, ConstantPtr, 16, 2 * 8}});
1185   if (ST.hasFlatAddressSpace()) {
1186     ExtLoads.legalForTypesWithMemDesc(
1187         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1188   }
1189 
1190   ExtLoads.clampScalar(0, S32, S32)
1191           .widenScalarToNextPow2(0)
1192           .unsupportedIfMemSizeNotPow2()
1193           .lower();
1194 
1195   auto &Atomics = getActionDefinitionsBuilder(
1196     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1197      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1198      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1199      G_ATOMICRMW_UMIN})
1200     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1201                {S64, GlobalPtr}, {S64, LocalPtr},
1202                {S32, RegionPtr}, {S64, RegionPtr}});
1203   if (ST.hasFlatAddressSpace()) {
1204     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1205   }
1206 
1207   if (ST.hasLDSFPAtomics()) {
1208     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1209       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1210   }
1211 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1214   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1215     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1216                 {S32, FlatPtr}, {S64, FlatPtr}})
1217     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1218                {S32, RegionPtr}, {S64, RegionPtr}});
1219   // TODO: Pointer types, any 32-bit or 64-bit vector
1220 
1221   // Condition should be s32 for scalar, s1 for vector.
1222   getActionDefinitionsBuilder(G_SELECT)
1223     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1224           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1225           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1226     .clampScalar(0, S16, S64)
1227     .scalarize(1)
1228     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1229     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1230     .clampMaxNumElements(0, S32, 2)
1231     .clampMaxNumElements(0, LocalPtr, 2)
1232     .clampMaxNumElements(0, PrivatePtr, 2)
1233     .scalarize(0)
1234     .widenScalarToNextPow2(0)
1235     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1236 
1237   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1238   // be more flexible with the shift amount type.
1239   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1240     .legalFor({{S32, S32}, {S64, S32}});
1241   if (ST.has16BitInsts()) {
1242     if (ST.hasVOP3PInsts()) {
1243       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1244             .clampMaxNumElements(0, S16, 2);
1245     } else
1246       Shifts.legalFor({{S16, S16}});
1247 
1248     // TODO: Support 16-bit shift amounts for all types
1249     Shifts.widenScalarIf(
1250       [=](const LegalityQuery &Query) {
1251         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1252         // 32-bit amount.
1253         const LLT ValTy = Query.Types[0];
1254         const LLT AmountTy = Query.Types[1];
1255         return ValTy.getSizeInBits() <= 16 &&
1256                AmountTy.getSizeInBits() < 16;
1257       }, changeTo(1, S16));
1258     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1259     Shifts.clampScalar(1, S32, S32);
1260     Shifts.clampScalar(0, S16, S64);
1261     Shifts.widenScalarToNextPow2(0, 16);
1262   } else {
1263     // Make sure we legalize the shift amount type first, as the general
1264     // expansion for the shifted type will produce much worse code if it hasn't
1265     // been truncated already.
1266     Shifts.clampScalar(1, S32, S32);
1267     Shifts.clampScalar(0, S32, S64);
1268     Shifts.widenScalarToNextPow2(0, 32);
1269   }
1270   Shifts.scalarize(0);
1271 
1272   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1273     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1274     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1275     unsigned IdxTypeIdx = 2;
1276 
1277     getActionDefinitionsBuilder(Op)
1278       .customIf([=](const LegalityQuery &Query) {
1279           const LLT EltTy = Query.Types[EltTypeIdx];
1280           const LLT VecTy = Query.Types[VecTypeIdx];
1281           const LLT IdxTy = Query.Types[IdxTypeIdx];
1282           return (EltTy.getSizeInBits() == 16 ||
1283                   EltTy.getSizeInBits() % 32 == 0) &&
1284                  VecTy.getSizeInBits() % 32 == 0 &&
1285                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1286                  IdxTy.getSizeInBits() == 32;
1287         })
1288       .clampScalar(EltTypeIdx, S32, S64)
1289       .clampScalar(VecTypeIdx, S32, S64)
1290       .clampScalar(IdxTypeIdx, S32, S32)
1291       // TODO: Clamp the number of elements before resorting to stack lowering.
1292       // It should only be necessary with variable indexes.
1293       // As a last resort, lower to the stack
1294       .lower();
1295   }
1296 
1297   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1298     .unsupportedIf([=](const LegalityQuery &Query) {
1299         const LLT &EltTy = Query.Types[1].getElementType();
1300         return Query.Types[0] != EltTy;
1301       });
1302 
1303   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1304     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1305     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1306 
1307     // FIXME: Doesn't handle extract of illegal sizes.
1308     getActionDefinitionsBuilder(Op)
1309       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1310       // FIXME: Multiples of 16 should not be legal.
1311       .legalIf([=](const LegalityQuery &Query) {
1312           const LLT BigTy = Query.Types[BigTyIdx];
1313           const LLT LitTy = Query.Types[LitTyIdx];
1314           return (BigTy.getSizeInBits() % 32 == 0) &&
1315                  (LitTy.getSizeInBits() % 16 == 0);
1316         })
1317       .widenScalarIf(
1318         [=](const LegalityQuery &Query) {
1319           const LLT BigTy = Query.Types[BigTyIdx];
1320           return (BigTy.getScalarSizeInBits() < 16);
1321         },
1322         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1323       .widenScalarIf(
1324         [=](const LegalityQuery &Query) {
1325           const LLT LitTy = Query.Types[LitTyIdx];
1326           return (LitTy.getScalarSizeInBits() < 16);
1327         },
1328         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1329       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1330       .widenScalarToNextPow2(BigTyIdx, 32);
1331 
1332   }
1333 
1334   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1335     .legalForCartesianProduct(AllS32Vectors, {S32})
1336     .legalForCartesianProduct(AllS64Vectors, {S64})
1337     .clampNumElements(0, V16S32, V32S32)
1338     .clampNumElements(0, V2S64, V16S64)
1339     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1340 
1341   if (ST.hasScalarPackInsts()) {
1342     BuildVector
1343       // FIXME: Should probably widen s1 vectors straight to s32
1344       .minScalarOrElt(0, S16)
1345       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1346       .minScalar(1, S32);
1347 
1348     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1349       .legalFor({V2S16, S32})
1350       .lower();
1351     BuildVector.minScalarOrElt(0, S32);
1352   } else {
1353     BuildVector.customFor({V2S16, S16});
1354     BuildVector.minScalarOrElt(0, S32);
1355 
1356     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1357       .customFor({V2S16, S32})
1358       .lower();
1359   }
1360 
1361   BuildVector.legalIf(isRegisterType(0));
1362 
1363   // FIXME: Clamp maximum size
1364   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1365     .legalIf(isRegisterType(0));
1366 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1369   if (ST.hasVOP3PInsts()) {
1370     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1371       .customFor({V2S16, V2S16})
1372       .lower();
1373   } else
1374     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1375 
1376   // Merge/Unmerge
1377   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1378     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1379     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1380 
1381     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1382       const LLT Ty = Query.Types[TypeIdx];
1383       if (Ty.isVector()) {
1384         const LLT &EltTy = Ty.getElementType();
1385         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1386           return true;
1387         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1388           return true;
1389       }
1390       return false;
1391     };
1392 
1393     auto &Builder = getActionDefinitionsBuilder(Op)
1394       .lowerFor({{S16, V2S16}})
1395       .lowerIf([=](const LegalityQuery &Query) {
1396           const LLT BigTy = Query.Types[BigTyIdx];
1397           return BigTy.getSizeInBits() == 32;
1398         })
1399       // Try to widen to s16 first for small types.
1400       // TODO: Only do this on targets with legal s16 shifts
1401       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1402       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1403       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1404       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1405                            elementTypeIs(1, S16)),
1406                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1410       .clampScalar(LitTyIdx, S32, S512)
1411       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1412       // Break up vectors with weird elements into scalars
1413       .fewerElementsIf(
1414         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1415         scalarize(0))
1416       .fewerElementsIf(
1417         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1418         scalarize(1))
1419       .clampScalar(BigTyIdx, S32, MaxScalar);
1420 
1421     if (Op == G_MERGE_VALUES) {
1422       Builder.widenScalarIf(
1423         // TODO: Use 16-bit shifts if legal for 8-bit values?
1424         [=](const LegalityQuery &Query) {
1425           const LLT Ty = Query.Types[LitTyIdx];
1426           return Ty.getSizeInBits() < 32;
1427         },
1428         changeTo(LitTyIdx, S32));
1429     }
1430 
1431     Builder.widenScalarIf(
1432       [=](const LegalityQuery &Query) {
1433         const LLT Ty = Query.Types[BigTyIdx];
1434         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1435           Ty.getSizeInBits() % 16 != 0;
1436       },
1437       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
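        // For example, a 264-bit type rounds up to 512 as a power of 2, but
        // alignTo<64>(265) is 320, so 320 is used instead.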
1440         const LLT &Ty = Query.Types[BigTyIdx];
1441         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1442         if (NewSizeInBits >= 256) {
1443           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1444           if (RoundedTo < NewSizeInBits)
1445             NewSizeInBits = RoundedTo;
1446         }
1447         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1448       })
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT &BigTy = Query.Types[BigTyIdx];
1451           const LLT &LitTy = Query.Types[LitTyIdx];
1452 
1453           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1454             return false;
1455           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1456             return false;
1457 
1458           return BigTy.getSizeInBits() % 16 == 0 &&
1459                  LitTy.getSizeInBits() % 16 == 0 &&
1460                  BigTy.getSizeInBits() <= MaxRegisterSize;
1461         })
1462       // Any vectors left are the wrong size. Scalarize them.
1463       .scalarize(0)
1464       .scalarize(1);
1465   }
1466 
1467   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1468   // RegBankSelect.
1469   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1470     .legalFor({{S32}, {S64}});
1471 
1472   if (ST.hasVOP3PInsts()) {
1473     SextInReg.lowerFor({{V2S16}})
1474       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1475       // get more vector shift opportunities, since we'll get those when
1476       // expanded.
1477       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1478   } else if (ST.has16BitInsts()) {
1479     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1480   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1483     SextInReg.lowerFor({{S32}, {S64}});
1484   }
1485 
1486   SextInReg
1487     .scalarize(0)
1488     .clampScalar(0, S32, S64)
1489     .lower();
1490 
1491   getActionDefinitionsBuilder(G_FSHR)
1492     .legalFor({{S32, S32}})
1493     .scalarize(0)
1494     .lower();
1495 
1496   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1497     .legalFor({S64});
1498 
1499   getActionDefinitionsBuilder(G_FENCE)
1500     .alwaysLegal();
1501 
1502   getActionDefinitionsBuilder({
1503       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1504       G_FCOPYSIGN,
1505 
1506       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1507       G_ATOMICRMW_NAND,
1508       G_ATOMICRMW_FSUB,
1509       G_READ_REGISTER,
1510       G_WRITE_REGISTER,
1511 
1512       G_SADDO, G_SSUBO,
1513 
1514        // TODO: Implement
1515       G_FMINIMUM, G_FMAXIMUM,
1516       G_FSHL
1517     }).lower();
1518 
1519   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1520         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1521         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1522     .unsupported();
1523 
1524   computeTables();
1525   verify(*ST.getInstrInfo());
1526 }
1527 
1528 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1529                                          MachineInstr &MI) const {
1530   MachineIRBuilder &B = Helper.MIRBuilder;
1531   MachineRegisterInfo &MRI = *B.getMRI();
1532   GISelChangeObserver &Observer = Helper.Observer;
1533 
1534   switch (MI.getOpcode()) {
1535   case TargetOpcode::G_ADDRSPACE_CAST:
1536     return legalizeAddrSpaceCast(MI, MRI, B);
1537   case TargetOpcode::G_FRINT:
1538     return legalizeFrint(MI, MRI, B);
1539   case TargetOpcode::G_FCEIL:
1540     return legalizeFceil(MI, MRI, B);
1541   case TargetOpcode::G_INTRINSIC_TRUNC:
1542     return legalizeIntrinsicTrunc(MI, MRI, B);
1543   case TargetOpcode::G_SITOFP:
1544     return legalizeITOFP(MI, MRI, B, true);
1545   case TargetOpcode::G_UITOFP:
1546     return legalizeITOFP(MI, MRI, B, false);
1547   case TargetOpcode::G_FPTOSI:
1548     return legalizeFPTOI(MI, MRI, B, true);
1549   case TargetOpcode::G_FPTOUI:
1550     return legalizeFPTOI(MI, MRI, B, false);
1551   case TargetOpcode::G_FMINNUM:
1552   case TargetOpcode::G_FMAXNUM:
1553   case TargetOpcode::G_FMINNUM_IEEE:
1554   case TargetOpcode::G_FMAXNUM_IEEE:
1555     return legalizeMinNumMaxNum(Helper, MI);
1556   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1557     return legalizeExtractVectorElt(MI, MRI, B);
1558   case TargetOpcode::G_INSERT_VECTOR_ELT:
1559     return legalizeInsertVectorElt(MI, MRI, B);
1560   case TargetOpcode::G_SHUFFLE_VECTOR:
1561     return legalizeShuffleVector(MI, MRI, B);
1562   case TargetOpcode::G_FSIN:
1563   case TargetOpcode::G_FCOS:
1564     return legalizeSinCos(MI, MRI, B);
1565   case TargetOpcode::G_GLOBAL_VALUE:
1566     return legalizeGlobalValue(MI, MRI, B);
1567   case TargetOpcode::G_LOAD:
1568     return legalizeLoad(MI, MRI, B, Observer);
1569   case TargetOpcode::G_FMAD:
1570     return legalizeFMad(MI, MRI, B);
1571   case TargetOpcode::G_FDIV:
1572     return legalizeFDIV(MI, MRI, B);
1573   case TargetOpcode::G_UDIV:
1574   case TargetOpcode::G_UREM:
1575     return legalizeUDIV_UREM(MI, MRI, B);
1576   case TargetOpcode::G_SDIV:
1577   case TargetOpcode::G_SREM:
1578     return legalizeSDIV_SREM(MI, MRI, B);
1579   case TargetOpcode::G_ATOMIC_CMPXCHG:
1580     return legalizeAtomicCmpXChg(MI, MRI, B);
1581   case TargetOpcode::G_FLOG:
1582     return legalizeFlog(MI, B, numbers::ln2f);
1583   case TargetOpcode::G_FLOG10:
1584     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1585   case TargetOpcode::G_FEXP:
1586     return legalizeFExp(MI, B);
1587   case TargetOpcode::G_FPOW:
1588     return legalizeFPow(MI, B);
1589   case TargetOpcode::G_FFLOOR:
1590     return legalizeFFloor(MI, MRI, B);
1591   case TargetOpcode::G_BUILD_VECTOR:
1592     return legalizeBuildVector(MI, MRI, B);
1593   default:
1594     return false;
1595   }
1596 
1597   llvm_unreachable("expected switch to return");
1598 }
1599 
1600 Register AMDGPULegalizerInfo::getSegmentAperture(
1601   unsigned AS,
1602   MachineRegisterInfo &MRI,
1603   MachineIRBuilder &B) const {
1604   MachineFunction &MF = B.getMF();
1605   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1606   const LLT S32 = LLT::scalar(32);
1607 
1608   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1609 
1610   if (ST.hasApertureRegs()) {
1611     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1612     // getreg.
1613     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1614         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1615         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1616     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1617         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1618         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1619     unsigned Encoding =
1620         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1621         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1622         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1623 
1624     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1625 
1626     B.buildInstr(AMDGPU::S_GETREG_B32)
1627       .addDef(GetReg)
1628       .addImm(Encoding);
1629     MRI.setType(GetReg, S32);
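    // The extracted field holds the upper bits of the aperture base; shifting
    // it left by its width (WidthM1 + 1) yields the high 32 bits of the
    // 64-bit aperture address.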
1630 
1631     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1632     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1633   }
1634 
1635   Register QueuePtr = MRI.createGenericVirtualRegister(
1636     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1637 
1638   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1639     return Register();
1640 
1641   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1642   // private_segment_aperture_base_hi.
1643   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1644 
1645   // TODO: can we be smarter about machine pointer info?
1646   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1647   MachineMemOperand *MMO = MF.getMachineMemOperand(
1648       PtrInfo,
1649       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1650           MachineMemOperand::MOInvariant,
1651       4, commonAlignment(Align(64), StructOffset));
1652 
1653   Register LoadAddr;
1654 
1655   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1656   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1657 }
1658 
1659 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1660   MachineInstr &MI, MachineRegisterInfo &MRI,
1661   MachineIRBuilder &B) const {
1662   MachineFunction &MF = B.getMF();
1663 
1664   const LLT S32 = LLT::scalar(32);
1665   Register Dst = MI.getOperand(0).getReg();
1666   Register Src = MI.getOperand(1).getReg();
1667 
1668   LLT DstTy = MRI.getType(Dst);
1669   LLT SrcTy = MRI.getType(Src);
1670   unsigned DestAS = DstTy.getAddressSpace();
1671   unsigned SrcAS = SrcTy.getAddressSpace();
1672 
1673   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1674   // vector element.
1675   assert(!DstTy.isVector());
1676 
1677   const AMDGPUTargetMachine &TM
1678     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1679 
1680   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1681   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1682     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1683     return true;
1684   }
1685 
1686   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1687     // Truncate.
1688     B.buildExtract(Dst, Src, 0);
1689     MI.eraseFromParent();
1690     return true;
1691   }
1692 
1693   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1694     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1695     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1696 
1697     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1698     // another. Merge operands are required to be the same type, but creating an
1699     // extra ptrtoint would be kind of pointless.
1700     auto HighAddr = B.buildConstant(
1701       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1702     B.buildMerge(Dst, {Src, HighAddr});
1703     MI.eraseFromParent();
1704     return true;
1705   }
1706 
1707   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1708     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1709            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1710     unsigned NullVal = TM.getNullPointerValue(DestAS);
1711 
1712     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1713     auto FlatNull = B.buildConstant(SrcTy, 0);
1714 
1715     // Extract low 32-bits of the pointer.
1716     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1717 
1718     auto CmpRes =
1719         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1720     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1721 
1722     MI.eraseFromParent();
1723     return true;
1724   }
1725 
1726   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1727     return false;
1728 
1729   if (!ST.hasFlatAddressSpace())
1730     return false;
1731 
1732   auto SegmentNull =
1733       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1734   auto FlatNull =
1735       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
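  // A local/private pointer is converted to flat by placing the 32-bit
  // segment offset in the low half and the segment aperture base in the high
  // half; the segment null value maps to the flat null value.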
1736 
1737   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1738   if (!ApertureReg.isValid())
1739     return false;
1740 
1741   auto CmpRes =
1742       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1743 
1744   // Coerce the type of the low half of the result so we can use merge_values.
1745   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1746 
1747   // TODO: Should we allow mismatched types but matching sizes in merges to
1748   // avoid the ptrtoint?
1749   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1750   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1751 
1752   MI.eraseFromParent();
1753   return true;
1754 }
1755 
1756 bool AMDGPULegalizerInfo::legalizeFrint(
1757   MachineInstr &MI, MachineRegisterInfo &MRI,
1758   MachineIRBuilder &B) const {
1759   Register Src = MI.getOperand(1).getReg();
1760   LLT Ty = MRI.getType(Src);
1761   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1762 
1763   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1764   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
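  // C1 is 2^52; adding and subtracting copysign(2^52, src) rounds src to an
  // integer because doubles with magnitude >= 2^52 have no fractional bits.
  // C2 is the largest double below 2^52; inputs with |src| > C2 are already
  // integral, so the original source is selected for them below.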
1765 
1766   auto C1 = B.buildFConstant(Ty, C1Val);
1767   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1768 
1769   // TODO: Should this propagate fast-math-flags?
1770   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1771   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1772 
1773   auto C2 = B.buildFConstant(Ty, C2Val);
1774   auto Fabs = B.buildFAbs(Ty, Src);
1775 
1776   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1777   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1778   MI.eraseFromParent();
1779   return true;
1780 }
1781 
1782 bool AMDGPULegalizerInfo::legalizeFceil(
1783   MachineInstr &MI, MachineRegisterInfo &MRI,
1784   MachineIRBuilder &B) const {
1785 
1786   const LLT S1 = LLT::scalar(1);
1787   const LLT S64 = LLT::scalar(64);
1788 
1789   Register Src = MI.getOperand(1).getReg();
1790   assert(MRI.getType(Src) == S64);
1791 
1792   // result = trunc(src)
1793   // if (src > 0.0 && src != result)
1794   //   result += 1.0
1795 
1796   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1797 
1798   const auto Zero = B.buildFConstant(S64, 0.0);
1799   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1803   auto Add = B.buildSelect(S64, And, One, Zero);
1804 
1805   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1808 }
1809 
1810 static MachineInstrBuilder extractF64Exponent(Register Hi,
1811                                               MachineIRBuilder &B) {
1812   const unsigned FractBits = 52;
1813   const unsigned ExpBits = 11;
1814   LLT S32 = LLT::scalar(32);
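  // The f64 exponent field occupies bits [52, 62], i.e. bits [20, 30] of the
  // high 32-bit word, so extract ExpBits bits starting at FractBits - 32 and
  // subtract the bias (1023) to get the unbiased exponent.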
1815 
1816   auto Const0 = B.buildConstant(S32, FractBits - 32);
1817   auto Const1 = B.buildConstant(S32, ExpBits);
1818 
1819   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1820     .addUse(Hi)
1821     .addUse(Const0.getReg(0))
1822     .addUse(Const1.getReg(0));
1823 
1824   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1825 }
1826 
1827 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1828   MachineInstr &MI, MachineRegisterInfo &MRI,
1829   MachineIRBuilder &B) const {
1830   const LLT S1 = LLT::scalar(1);
1831   const LLT S32 = LLT::scalar(32);
1832   const LLT S64 = LLT::scalar(64);
1833 
1834   Register Src = MI.getOperand(1).getReg();
1835   assert(MRI.getType(Src) == S64);
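  // Truncate toward zero by masking off the fractional bits of the mantissa.
  // For an unbiased exponent E in [0, 51] the low (52 - E) mantissa bits are
  // fractional, so they are cleared with ~(FractMask >> E). Exponents below 0
  // give +/-0 (sign bit only); exponents above 51 mean the value is already
  // integral, so the source is returned unchanged.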
1836 
1837   // TODO: Should this use extract since the low half is unused?
1838   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1839   Register Hi = Unmerge.getReg(1);
1840 
1841   // Extract the upper half, since this is where we will find the sign and
1842   // exponent.
1843   auto Exp = extractF64Exponent(Hi, B);
1844 
1845   const unsigned FractBits = 52;
1846 
1847   // Extract the sign bit.
1848   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1849   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1850 
1851   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1852 
1853   const auto Zero32 = B.buildConstant(S32, 0);
1854 
1855   // Extend back to 64-bits.
1856   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1857 
1858   auto Shr = B.buildAShr(S64, FractMask, Exp);
1859   auto Not = B.buildNot(S64, Shr);
1860   auto Tmp0 = B.buildAnd(S64, Src, Not);
1861   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1862 
1863   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1864   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1865 
1866   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1867   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1868   MI.eraseFromParent();
1869   return true;
1870 }
1871 
1872 bool AMDGPULegalizerInfo::legalizeITOFP(
1873   MachineInstr &MI, MachineRegisterInfo &MRI,
1874   MachineIRBuilder &B, bool Signed) const {
1875 
1876   Register Dst = MI.getOperand(0).getReg();
1877   Register Src = MI.getOperand(1).getReg();
1878 
1879   const LLT S64 = LLT::scalar(64);
1880   const LLT S32 = LLT::scalar(32);
1881 
1882   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
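  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 with ldexp, then the unsigned
  // low half is added, i.e. result = ldexp(cvt(hi), 32) + uitofp(lo).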
1883 
1884   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1885 
1886   auto CvtHi = Signed ?
1887     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1888     B.buildUITOFP(S64, Unmerge.getReg(1));
1889 
1890   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1891 
1892   auto ThirtyTwo = B.buildConstant(S32, 32);
1893   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1894     .addUse(CvtHi.getReg(0))
1895     .addUse(ThirtyTwo.getReg(0));
1896 
1897   // TODO: Should this propagate fast-math-flags?
1898   B.buildFAdd(Dst, LdExp, CvtLo);
1899   MI.eraseFromParent();
1900   return true;
1901 }
1902 
1903 // TODO: Copied from DAG implementation. Verify logic and document how this
1904 // actually works.
1905 bool AMDGPULegalizerInfo::legalizeFPTOI(
1906   MachineInstr &MI, MachineRegisterInfo &MRI,
1907   MachineIRBuilder &B, bool Signed) const {
1908 
1909   Register Dst = MI.getOperand(0).getReg();
1910   Register Src = MI.getOperand(1).getReg();
1911 
1912   const LLT S64 = LLT::scalar(64);
1913   const LLT S32 = LLT::scalar(32);
1914 
1915   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
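  // K0 is 2^-32 and K1 is -2^32: FloorMul = floor(trunc(src) * 2^-32)
  // approximates the high 32 bits of the result, and the fma recovers the
  // remaining low part as trunc(src) - FloorMul * 2^32.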
1916 
1917   unsigned Flags = MI.getFlags();
1918 
1919   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1920   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1921   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1922 
1923   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1924   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1925   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1926 
1927   auto Hi = Signed ?
1928     B.buildFPTOSI(S32, FloorMul) :
1929     B.buildFPTOUI(S32, FloorMul);
1930   auto Lo = B.buildFPTOUI(S32, Fma);
1931 
1932   B.buildMerge(Dst, { Lo, Hi });
1933   MI.eraseFromParent();
1934 
1935   return true;
1936 }
1937 
1938 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1939                                                MachineInstr &MI) const {
1940   MachineFunction &MF = Helper.MIRBuilder.getMF();
1941   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1942 
1943   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1944                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1945 
1946   // With ieee_mode disabled, the instructions have the correct behavior
1947   // already for G_FMINNUM/G_FMAXNUM
1948   if (!MFI->getMode().IEEE)
1949     return !IsIEEEOp;
1950 
1951   if (IsIEEEOp)
1952     return true;
1953 
1954   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960   // TODO: Should move some of this into LegalizerHelper.
1961 
1962   // TODO: Promote dynamic indexing of s16 to s32
1963 
1964   // FIXME: Artifact combiner probably should have replaced the truncated
1965   // constant before this, so we shouldn't need
1966   // getConstantVRegValWithLookThrough.
1967   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1968     MI.getOperand(2).getReg(), MRI);
1969   if (!IdxVal) // Dynamic case will be selected to register indexing.
1970     return true;
1971 
1972   Register Dst = MI.getOperand(0).getReg();
1973   Register Vec = MI.getOperand(1).getReg();
1974 
1975   LLT VecTy = MRI.getType(Vec);
1976   LLT EltTy = VecTy.getElementType();
1977   assert(EltTy == MRI.getType(Dst));
1978 
1979   if (IdxVal->Value < VecTy.getNumElements())
1980     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1981   else
1982     B.buildUndef(Dst);
1983 
1984   MI.eraseFromParent();
1985   return true;
1986 }
1987 
1988 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1989   MachineInstr &MI, MachineRegisterInfo &MRI,
1990   MachineIRBuilder &B) const {
1991   // TODO: Should move some of this into LegalizerHelper.
1992 
1993   // TODO: Promote dynamic indexing of s16 to s32
1994 
1995   // FIXME: Artifact combiner probably should have replaced the truncated
1996   // constant before this, so we shouldn't need
1997   // getConstantVRegValWithLookThrough.
1998   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1999     MI.getOperand(3).getReg(), MRI);
2000   if (!IdxVal) // Dynamic case will be selected to register indexing.
2001     return true;
2002 
2003   Register Dst = MI.getOperand(0).getReg();
2004   Register Vec = MI.getOperand(1).getReg();
2005   Register Ins = MI.getOperand(2).getReg();
2006 
2007   LLT VecTy = MRI.getType(Vec);
2008   LLT EltTy = VecTy.getElementType();
2009   assert(EltTy == MRI.getType(Ins));
2010 
2011   if (IdxVal->Value < VecTy.getNumElements())
2012     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2013   else
2014     B.buildUndef(Dst);
2015 
2016   MI.eraseFromParent();
2017   return true;
2018 }
2019 
2020 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2021   MachineInstr &MI, MachineRegisterInfo &MRI,
2022   MachineIRBuilder &B) const {
2023   const LLT V2S16 = LLT::vector(2, 16);
2024 
2025   Register Dst = MI.getOperand(0).getReg();
2026   Register Src0 = MI.getOperand(1).getReg();
2027   LLT DstTy = MRI.getType(Dst);
2028   LLT SrcTy = MRI.getType(Src0);
2029 
2030   if (SrcTy == V2S16 && DstTy == V2S16 &&
2031       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2032     return true;
2033 
2034   MachineIRBuilder HelperBuilder(MI);
2035   GISelObserverWrapper DummyObserver;
2036   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2037   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2038 }
2039 
2040 bool AMDGPULegalizerInfo::legalizeSinCos(
2041   MachineInstr &MI, MachineRegisterInfo &MRI,
2042   MachineIRBuilder &B) const {
2043 
2044   Register DstReg = MI.getOperand(0).getReg();
2045   Register SrcReg = MI.getOperand(1).getReg();
2046   LLT Ty = MRI.getType(DstReg);
2047   unsigned Flags = MI.getFlags();
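  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi). On
  // subtargets with a reduced valid input range, the scaled value is first
  // reduced to [0, 1) with the fract intrinsic.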
2048 
2049   Register TrigVal;
2050   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2051   if (ST.hasTrigReducedRange()) {
2052     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2053     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2054       .addUse(MulVal.getReg(0))
2055       .setMIFlags(Flags).getReg(0);
2056   } else
2057     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2058 
2059   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2060     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2061   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2062     .addUse(TrigVal)
2063     .setMIFlags(Flags);
2064   MI.eraseFromParent();
2065   return true;
2066 }
2067 
2068 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2069                                                   MachineIRBuilder &B,
2070                                                   const GlobalValue *GV,
2071                                                   int64_t Offset,
2072                                                   unsigned GAFlags) const {
2073   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2074   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2075   // to the following code sequence:
2076   //
2077   // For constant address space:
2078   //   s_getpc_b64 s[0:1]
2079   //   s_add_u32 s0, s0, $symbol
2080   //   s_addc_u32 s1, s1, 0
2081   //
2082   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2083   //   a fixup or relocation is emitted to replace $symbol with a literal
2084   //   constant, which is a pc-relative offset from the encoding of the $symbol
2085   //   operand to the global variable.
2086   //
2087   // For global address space:
2088   //   s_getpc_b64 s[0:1]
2089   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2090   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2091   //
2092   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2093   //   fixups or relocations are emitted to replace $symbol@*@lo and
2094   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2095   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2096   //   operand to the global variable.
2097   //
2098   // What we want here is an offset from the value returned by s_getpc
2099   // (which is the address of the s_add_u32 instruction) to the global
2100   // variable, but since the encoding of $symbol starts 4 bytes after the start
2101   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2102   // small. This requires us to add 4 to the global variable offset in order to
2103   // compute the correct address.
2104 
2105   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2106 
2107   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2108     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2109 
2110   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2111     .addDef(PCReg);
2112 
2113   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2114   if (GAFlags == SIInstrInfo::MO_NONE)
2115     MIB.addImm(0);
2116   else
2117     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2118 
2119   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2120 
2121   if (PtrTy.getSizeInBits() == 32)
2122     B.buildExtract(DstReg, PCReg, 0);
2123   return true;
}
2125 
2126 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2127   MachineInstr &MI, MachineRegisterInfo &MRI,
2128   MachineIRBuilder &B) const {
2129   Register DstReg = MI.getOperand(0).getReg();
2130   LLT Ty = MRI.getType(DstReg);
2131   unsigned AS = Ty.getAddressSpace();
2132 
2133   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2134   MachineFunction &MF = B.getMF();
2135   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2136 
2137   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2138     if (!MFI->isEntryFunction()) {
2139       const Function &Fn = MF.getFunction();
2140       DiagnosticInfoUnsupported BadLDSDecl(
2141         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2142         DS_Warning);
2143       Fn.getContext().diagnose(BadLDSDecl);
2144 
2145       // We currently don't have a way to correctly allocate LDS objects that
2146       // aren't directly associated with a kernel. We do force inlining of
2147       // functions that use local objects. However, if these dead functions are
2148       // not eliminated, we don't want a compile time error. Just emit a warning
2149       // and a trap, since there should be no callable path here.
2150       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2151       B.buildUndef(DstReg);
2152       MI.eraseFromParent();
2153       return true;
2154     }
2155 
2156     // TODO: We could emit code to handle the initialization somewhere.
2157     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2158       const SITargetLowering *TLI = ST.getTargetLowering();
2159       if (!TLI->shouldUseLDSConstAddress(GV)) {
2160         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2162       }
2163 
2164       B.buildConstant(
2165           DstReg,
2166           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2167       MI.eraseFromParent();
2168       return true;
2169     }
2170 
2171     const Function &Fn = MF.getFunction();
2172     DiagnosticInfoUnsupported BadInit(
2173       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2174     Fn.getContext().diagnose(BadInit);
2175     return true;
2176   }
2177 
2178   const SITargetLowering *TLI = ST.getTargetLowering();
2179 
2180   if (TLI->shouldEmitFixup(GV)) {
2181     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2182     MI.eraseFromParent();
2183     return true;
2184   }
2185 
2186   if (TLI->shouldEmitPCReloc(GV)) {
2187     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2188     MI.eraseFromParent();
2189     return true;
2190   }
2191 
2192   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2193   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2194 
2195   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2196       MachinePointerInfo::getGOT(MF),
2197       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2198           MachineMemOperand::MOInvariant,
2199       8 /*Size*/, Align(8));
2200 
2201   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2202 
2203   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2205     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2206     B.buildExtract(DstReg, Load, 0);
2207   } else
2208     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2209 
2210   MI.eraseFromParent();
2211   return true;
2212 }
2213 
2214 bool AMDGPULegalizerInfo::legalizeLoad(
2215   MachineInstr &MI, MachineRegisterInfo &MRI,
2216   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2217   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2218   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2219   Observer.changingInstr(MI);
2220   MI.getOperand(1).setReg(Cast.getReg(0));
2221   Observer.changedInstr(MI);
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeFMad(
2226   MachineInstr &MI, MachineRegisterInfo &MRI,
2227   MachineIRBuilder &B) const {
2228   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2229   assert(Ty.isScalar());
2230 
2231   MachineFunction &MF = B.getMF();
2232   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2233 
2234   // TODO: Always legal with future ftz flag.
2235   // FIXME: Do we need just output?
2236   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2237     return true;
2238   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2239     return true;
2240 
2241   MachineIRBuilder HelperBuilder(MI);
2242   GISelObserverWrapper DummyObserver;
2243   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2244   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2245 }
2246 
2247 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2248   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2249   Register DstReg = MI.getOperand(0).getReg();
2250   Register PtrReg = MI.getOperand(1).getReg();
2251   Register CmpVal = MI.getOperand(2).getReg();
2252   Register NewVal = MI.getOperand(3).getReg();
2253 
2254   assert(SITargetLowering::isFlatGlobalAddrSpace(
2255            MRI.getType(PtrReg).getAddressSpace()) &&
2256          "this should not have been custom lowered");
2257 
2258   LLT ValTy = MRI.getType(CmpVal);
2259   LLT VecTy = LLT::vector(2, ValTy);
2260 
2261   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2262 
2263   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2264     .addDef(DstReg)
2265     .addUse(PtrReg)
2266     .addUse(PackedVal)
2267     .setMemRefs(MI.memoperands());
2268 
2269   MI.eraseFromParent();
2270   return true;
2271 }
2272 
2273 bool AMDGPULegalizerInfo::legalizeFlog(
2274   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2275   Register Dst = MI.getOperand(0).getReg();
2276   Register Src = MI.getOperand(1).getReg();
2277   LLT Ty = B.getMRI()->getType(Dst);
2278   unsigned Flags = MI.getFlags();
2279 
2280   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2281   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2282 
2283   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2284   MI.eraseFromParent();
2285   return true;
2286 }
2287 
2288 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2289                                        MachineIRBuilder &B) const {
2290   Register Dst = MI.getOperand(0).getReg();
2291   Register Src = MI.getOperand(1).getReg();
2292   unsigned Flags = MI.getFlags();
2293   LLT Ty = B.getMRI()->getType(Dst);
2294 
2295   auto K = B.buildFConstant(Ty, numbers::log2e);
2296   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2297   B.buildFExp2(Dst, Mul, Flags);
2298   MI.eraseFromParent();
2299   return true;
2300 }
2301 
2302 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2303                                        MachineIRBuilder &B) const {
2304   Register Dst = MI.getOperand(0).getReg();
2305   Register Src0 = MI.getOperand(1).getReg();
2306   Register Src1 = MI.getOperand(2).getReg();
2307   unsigned Flags = MI.getFlags();
2308   LLT Ty = B.getMRI()->getType(Dst);
2309   const LLT S16 = LLT::scalar(16);
2310   const LLT S32 = LLT::scalar(32);
2311 
2312   if (Ty == S32) {
2313     auto Log = B.buildFLog2(S32, Src0, Flags);
2314     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2315       .addUse(Log.getReg(0))
2316       .addUse(Src1)
2317       .setMIFlags(Flags);
2318     B.buildFExp2(Dst, Mul, Flags);
2319   } else if (Ty == S16) {
2320     // There's no f16 fmul_legacy, so we need to convert for it.
2321     auto Log = B.buildFLog2(S16, Src0, Flags);
2322     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2323     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2324     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2325       .addUse(Ext0.getReg(0))
2326       .addUse(Ext1.getReg(0))
2327       .setMIFlags(Flags);
2328 
2329     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2330   } else
2331     return false;
2332 
2333   MI.eraseFromParent();
2334   return true;
2335 }
2336 
2337 // Find a source register, ignoring any possible source modifiers.
2338 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2339   Register ModSrc = OrigSrc;
2340   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2341     ModSrc = SrcFNeg->getOperand(1).getReg();
2342     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2343       ModSrc = SrcFAbs->getOperand(1).getReg();
2344   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2345     ModSrc = SrcFAbs->getOperand(1).getReg();
2346   return ModSrc;
2347 }
2348 
2349 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2350                                          MachineRegisterInfo &MRI,
2351                                          MachineIRBuilder &B) const {
2352 
2353   const LLT S1 = LLT::scalar(1);
2354   const LLT S64 = LLT::scalar(64);
2355   Register Dst = MI.getOperand(0).getReg();
2356   Register OrigSrc = MI.getOperand(1).getReg();
2357   unsigned Flags = MI.getFlags();
2358   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2359          "this should not have been custom lowered");
2360 
2361   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2362   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2363   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2364   // V_FRACT bug is:
2365   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2366   //
2367   // Convert floor(x) to (x - fract(x))
2368 
2369   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2370     .addUse(OrigSrc)
2371     .setMIFlags(Flags);
2372 
2373   // Give source modifier matching some assistance before obscuring a foldable
2374   // pattern.
2375 
2376   // TODO: We can avoid the neg on the fract? The input sign to fract
2377   // shouldn't matter?
2378   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2379 
2380   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
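  // Const (0x3fefffffffffffff) is the largest double below 1.0, the clamp
  // value in the V_FRACT workaround described above.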
2381 
2382   Register Min = MRI.createGenericVirtualRegister(S64);
2383 
2384   // We don't need to concern ourselves with the snan handling difference, so
2385   // use the one which will directly select.
2386   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2387   if (MFI->getMode().IEEE)
2388     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2389   else
2390     B.buildFMinNum(Min, Fract, Const, Flags);
2391 
2392   Register CorrectedFract = Min;
2393   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2394     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2395     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2396   }
2397 
2398   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2399   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2400 
2401   MI.eraseFromParent();
2402   return true;
2403 }
2404 
2405 // Turn an illegal packed v2s16 build vector into bit operations.
2406 // TODO: This should probably be a bitcast action in LegalizerHelper.
2407 bool AMDGPULegalizerInfo::legalizeBuildVector(
2408   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2409   Register Dst = MI.getOperand(0).getReg();
2410   const LLT S32 = LLT::scalar(32);
2411   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2412 
2413   Register Src0 = MI.getOperand(1).getReg();
2414   Register Src1 = MI.getOperand(2).getReg();
2415   assert(MRI.getType(Src0) == LLT::scalar(16));
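  // Pack the two s16 sources into a single s32 with G_MERGE_VALUES and
  // reinterpret the result as <2 x s16>.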
2416 
2417   auto Merge = B.buildMerge(S32, {Src0, Src1});
2418   B.buildBitcast(Dst, Merge);
2419 
2420   MI.eraseFromParent();
2421   return true;
2422 }
2423 
// Return the use branch instruction, or null if the usage is invalid.
2425 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2426                                        MachineRegisterInfo &MRI,
2427                                        MachineInstr *&Br,
2428                                        MachineBasicBlock *&UncondBrTarget) {
2429   Register CondDef = MI.getOperand(0).getReg();
2430   if (!MRI.hasOneNonDBGUse(CondDef))
2431     return nullptr;
2432 
2433   MachineBasicBlock *Parent = MI.getParent();
2434   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2435   if (UseMI.getParent() != Parent ||
2436       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2437     return nullptr;
2438 
2439   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2440   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2441   if (Next == Parent->end()) {
2442     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2443     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2444       return nullptr;
2445     UncondBrTarget = &*NextMBB;
2446   } else {
2447     if (Next->getOpcode() != AMDGPU::G_BR)
2448       return nullptr;
2449     Br = &*Next;
2450     UncondBrTarget = Br->getOperand(0).getMBB();
2451   }
2452 
2453   return &UseMI;
2454 }
2455 
2456 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2457                                                MachineRegisterInfo &MRI,
2458                                                Register LiveIn,
2459                                                Register PhyReg) const {
2460   assert(PhyReg.isPhysical() && "Physical register expected");
2461 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2464   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2465   if (!MRI.getVRegDef(LiveIn)) {
2466     // FIXME: Should have scoped insert pt
2467     MachineBasicBlock &OrigInsBB = B.getMBB();
2468     auto OrigInsPt = B.getInsertPt();
2469 
2470     MachineBasicBlock &EntryMBB = B.getMF().front();
2471     EntryMBB.addLiveIn(PhyReg);
2472     B.setInsertPt(EntryMBB, EntryMBB.begin());
2473     B.buildCopy(LiveIn, PhyReg);
2474 
2475     B.setInsertPt(OrigInsBB, OrigInsPt);
2476   }
2477 
2478   return LiveIn;
2479 }
2480 
2481 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2482                                                 MachineRegisterInfo &MRI,
2483                                                 Register PhyReg, LLT Ty,
2484                                                 bool InsertLiveInCopy) const {
2485   assert(PhyReg.isPhysical() && "Physical register expected");
2486 
  // Get or create the virtual live-in register.
2488   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2489   if (!LiveIn) {
2490     LiveIn = MRI.createGenericVirtualRegister(Ty);
2491     MRI.addLiveIn(PhyReg, LiveIn);
2492   }
2493 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting the live-in copy from the
  // physical register to a virtual register is not required.
2497   if (!InsertLiveInCopy)
2498     return LiveIn;
2499 
2500   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2501 }
2502 
2503 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2504                                          const ArgDescriptor *Arg,
2505                                          const TargetRegisterClass *ArgRC,
2506                                          LLT ArgTy) const {
2507   MCRegister SrcReg = Arg->getRegister();
2508   assert(SrcReg.isPhysical() && "Physical register expected");
2509   assert(DstReg.isVirtual() && "Virtual register expected");
2510 
2511   MachineRegisterInfo &MRI = *B.getMRI();
2512   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);
2513 
2514   if (Arg->isMasked()) {
2515     // TODO: Should we try to emit this once in the entry block?
2516     const LLT S32 = LLT::scalar(32);
2517     const unsigned Mask = Arg->getMask();
2518     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2519 
2520     Register AndMaskSrc = LiveIn;
2521 
2522     if (Shift != 0) {
2523       auto ShiftAmt = B.buildConstant(S32, Shift);
2524       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2525     }
2526 
2527     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2528   } else {
2529     B.buildCopy(DstReg, LiveIn);
2530   }
2531 
2532   return true;
2533 }
2534 
2535 bool AMDGPULegalizerInfo::loadInputValue(
2536     Register DstReg, MachineIRBuilder &B,
2537     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2538   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2539   const ArgDescriptor *Arg;
2540   const TargetRegisterClass *ArgRC;
2541   LLT ArgTy;
2542   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2543 
2544   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2545     return false; // TODO: Handle these
2546   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2547 }
2548 
2549 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2550     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2551     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2552   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2553     return false;
2554 
2555   MI.eraseFromParent();
2556   return true;
2557 }
2558 
2559 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2560                                        MachineRegisterInfo &MRI,
2561                                        MachineIRBuilder &B) const {
2562   Register Dst = MI.getOperand(0).getReg();
2563   LLT DstTy = MRI.getType(Dst);
2564   LLT S16 = LLT::scalar(16);
2565   LLT S32 = LLT::scalar(32);
2566   LLT S64 = LLT::scalar(64);
2567 
2568   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2569     return true;
2570 
2571   if (DstTy == S16)
2572     return legalizeFDIV16(MI, MRI, B);
2573   if (DstTy == S32)
2574     return legalizeFDIV32(MI, MRI, B);
2575   if (DstTy == S64)
2576     return legalizeFDIV64(MI, MRI, B);
2577 
2578   return false;
2579 }
2580 
2581 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2582                                                   Register DstReg,
2583                                                   Register X,
2584                                                   Register Y,
2585                                                   bool IsDiv) const {
2586   const LLT S1 = LLT::scalar(1);
2587   const LLT S32 = LLT::scalar(32);
2588 
2589   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2590   // algorithm used here.
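  // In short: start from a float reciprocal of y scaled to a fixed-point
  // estimate z ~= 2^32 / y, refine z with one Newton-Raphson step, then form
  // q = umulh(x, z) and r = x - q * y. The estimate can leave q slightly too
  // small, so q and r are conditionally corrected twice below.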
2591 
2592   // Initial estimate of inv(y).
2593   auto FloatY = B.buildUITOFP(S32, Y);
2594   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2595   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2596   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2597   auto Z = B.buildFPTOUI(S32, ScaledY);
2598 
  // One round of UNR (unsigned Newton-Raphson).
2600   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2601   auto NegYZ = B.buildMul(S32, NegY, Z);
2602   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2603 
2604   // Quotient/remainder estimate.
2605   auto Q = B.buildUMulH(S32, X, Z);
2606   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2607 
2608   // First quotient/remainder refinement.
2609   auto One = B.buildConstant(S32, 1);
2610   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2611   if (IsDiv)
2612     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2613   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2614 
2615   // Second quotient/remainder refinement.
2616   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2617   if (IsDiv)
2618     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2619   else
2620     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2621 }
2622 
2623 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2624                                               MachineRegisterInfo &MRI,
2625                                               MachineIRBuilder &B) const {
2626   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2627   Register DstReg = MI.getOperand(0).getReg();
2628   Register Num = MI.getOperand(1).getReg();
2629   Register Den = MI.getOperand(2).getReg();
2630   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2631   MI.eraseFromParent();
2632   return true;
2633 }
2634 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2636 //
2637 // Return lo, hi of result
2638 //
2639 // %cvt.lo = G_UITOFP Val.lo
2640 // %cvt.hi = G_UITOFP Val.hi
2641 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2642 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2643 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2644 // %mul2 = G_FMUL %mul1, 2**(-32)
2645 // %trunc = G_INTRINSIC_TRUNC %mul2
2646 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2647 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2648 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2649                                                        Register Val) {
2650   const LLT S32 = LLT::scalar(32);
2651   auto Unmerge = B.buildUnmerge(S32, Val);
2652 
2653   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2654   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2655 
2656   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2657                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2658 
2659   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2660   auto Mul1 =
2661       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2662 
2663   // 2**(-32)
2664   auto Mul2 =
2665       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2666   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2667 
2668   // -(2**32)
2669   auto Mad2 = B.buildFMAD(S32, Trunc,
2670                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2671 
2672   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2673   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2674 
2675   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2676 }
2677 
2678 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2679                                                   Register DstReg,
2680                                                   Register Numer,
2681                                                   Register Denom,
2682                                                   bool IsDiv) const {
2683   const LLT S32 = LLT::scalar(32);
2684   const LLT S64 = LLT::scalar(64);
2685   const LLT S1 = LLT::scalar(1);
2686   Register RcpLo, RcpHi;
2687 
2688   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2689 
2690   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2691 
2692   auto Zero64 = B.buildConstant(S64, 0);
2693   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2694 
2695   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2696   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2697 
2698   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2699   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2700   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2701 
2702   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2703   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2704   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2705   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2706 
2707   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2708   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2709   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2710   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2711   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2712 
2713   auto Zero32 = B.buildConstant(S32, 0);
2714   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2715   auto Add2_HiC =
2716       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2717   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2718   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2719 
2720   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2721   Register NumerLo = UnmergeNumer.getReg(0);
2722   Register NumerHi = UnmergeNumer.getReg(1);
2723 
2724   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2725   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2726   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2727   Register Mul3_Lo = UnmergeMul3.getReg(0);
2728   Register Mul3_Hi = UnmergeMul3.getReg(1);
2729   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2730   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2731   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2732   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2733 
2734   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2735   Register DenomLo = UnmergeDenom.getReg(0);
2736   Register DenomHi = UnmergeDenom.getReg(1);
2737 
2738   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2739   auto C1 = B.buildSExt(S32, CmpHi);
2740 
2741   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2742   auto C2 = B.buildSExt(S32, CmpLo);
2743 
2744   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2745   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2746 
2747   // TODO: Here and below portions of the code can be enclosed into if/endif.
2748   // Currently control flow is unconditional and we have 4 selects after
2749   // potential endif to substitute PHIs.
2750 
2751   // if C3 != 0 ...
2752   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2753   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2754   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2755   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2756 
2757   auto One64 = B.buildConstant(S64, 1);
2758   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2759 
2760   auto C4 =
2761       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2762   auto C5 =
2763       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2764   auto C6 = B.buildSelect(
2765       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2766 
2767   // if (C6 != 0)
2768   auto Add4 = B.buildAdd(S64, Add3, One64);
2769   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2770 
2771   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2772   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2773   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2774 
2775   // endif C6
2776   // endif C3
2777 
2778   if (IsDiv) {
2779     auto Sel1 = B.buildSelect(
2780         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2781     B.buildSelect(DstReg,
2782                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2783   } else {
2784     auto Sel2 = B.buildSelect(
2785         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2786     B.buildSelect(DstReg,
2787                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2788   }
2789 }
2790 
2791 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2792                                             MachineRegisterInfo &MRI,
2793                                             MachineIRBuilder &B) const {
2794   const LLT S64 = LLT::scalar(64);
2795   const LLT S32 = LLT::scalar(32);
2796   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2797   Register DstReg = MI.getOperand(0).getReg();
2798   Register Num = MI.getOperand(1).getReg();
2799   Register Den = MI.getOperand(2).getReg();
2800   LLT Ty = MRI.getType(DstReg);
2801 
2802   if (Ty == S32)
2803     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2804   else if (Ty == S64)
2805     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2806   else
2807     return false;
2808 
2809   MI.eraseFromParent();
2810   return true;
}
2813 
2814 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2815                                             MachineRegisterInfo &MRI,
2816                                             MachineIRBuilder &B) const {
2817   const LLT S64 = LLT::scalar(64);
2818   const LLT S32 = LLT::scalar(32);
2819 
2820   Register DstReg = MI.getOperand(0).getReg();
2821   const LLT Ty = MRI.getType(DstReg);
2822   if (Ty != S32 && Ty != S64)
2823     return false;
2824 
2825   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2826 
2827   Register LHS = MI.getOperand(1).getReg();
2828   Register RHS = MI.getOperand(2).getReg();
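  // Compute each operand's sign mask (all ones if negative) and conditionally
  // negate with (x + sign) ^ sign so the division can be done unsigned. The
  // quotient's sign is the XOR of the operand signs, while the remainder takes
  // the sign of the LHS; the sign is reapplied as (res ^ sign) - sign.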
2829 
2830   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2831   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2832   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2833 
2834   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2835   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2836 
2837   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2838   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2839 
2840   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2841   if (Ty == S32)
2842     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2843   else
2844     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2845 
2846   Register Sign;
2847   if (IsDiv)
2848     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2849   else
2850     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2851 
2852   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2853   B.buildSub(DstReg, UDivRem, Sign);
2854 
2855   MI.eraseFromParent();
2856   return true;
2857 }
2858 
2859 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2860                                                  MachineRegisterInfo &MRI,
2861                                                  MachineIRBuilder &B) const {
2862   Register Res = MI.getOperand(0).getReg();
2863   Register LHS = MI.getOperand(1).getReg();
2864   Register RHS = MI.getOperand(2).getReg();
2865 
2866   uint16_t Flags = MI.getFlags();
2867 
2868   LLT ResTy = MRI.getType(Res);
2869   LLT S32 = LLT::scalar(32);
2870   LLT S64 = LLT::scalar(64);
2871 
2872   const MachineFunction &MF = B.getMF();
2873   bool Unsafe =
2874     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2875 
2876   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2877     return false;
2878 
2879   if (!Unsafe && ResTy == S32 &&
2880       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2881     return false;
2882 
2883   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2884     // 1 / x -> RCP(x)
2885     if (CLHS->isExactlyValue(1.0)) {
2886       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2887         .addUse(RHS)
2888         .setMIFlags(Flags);
2889 
2890       MI.eraseFromParent();
2891       return true;
2892     }
2893 
2894     // -1 / x -> RCP( FNEG(x) )
2895     if (CLHS->isExactlyValue(-1.0)) {
2896       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2897       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2898         .addUse(FNeg.getReg(0))
2899         .setMIFlags(Flags);
2900 
2901       MI.eraseFromParent();
2902       return true;
2903     }
2904   }
2905 
2906   // x / y -> x * (1.0 / y)
2907   if (Unsafe) {
2908     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2909       .addUse(RHS)
2910       .setMIFlags(Flags);
2911     B.buildFMul(Res, LHS, RCP, Flags);
2912 
2913     MI.eraseFromParent();
2914     return true;
2915   }
2916 
2917   return false;
2918 }
2919 
2920 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2921                                          MachineRegisterInfo &MRI,
2922                                          MachineIRBuilder &B) const {
2923   Register Res = MI.getOperand(0).getReg();
2924   Register LHS = MI.getOperand(1).getReg();
2925   Register RHS = MI.getOperand(2).getReg();
2926 
2927   uint16_t Flags = MI.getFlags();
2928 
2929   LLT S16 = LLT::scalar(16);
2930   LLT S32 = LLT::scalar(32);
2931 
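  // Rough shape of the f16 expansion below: extend both operands to f32, form
  // an approximate quotient LHS * rcp(RHS) in f32, truncate back to f16, and
  // let the div_fixup intrinsic patch up special cases (infinities, NaNs,
  // zero denominators) using the original operands.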
2932   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2933   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2934 
2935   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2936     .addUse(RHSExt.getReg(0))
2937     .setMIFlags(Flags);
2938 
2939   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2940   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2941 
2942   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2943     .addUse(RDst.getReg(0))
2944     .addUse(RHS)
2945     .addUse(LHS)
2946     .setMIFlags(Flags);
2947 
2948   MI.eraseFromParent();
2949   return true;
2950 }
2951 
2952 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2953 // to enable denorm mode; when 'Enable' is false, restore the default mode.
2954 static void toggleSPDenormMode(bool Enable,
2955                                MachineIRBuilder &B,
2956                                const GCNSubtarget &ST,
2957                                AMDGPU::SIModeRegisterDefaults Mode) {
2958   // Set SP denorm mode to this value.
2959   unsigned SPDenormMode =
2960     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2961 
2962   if (ST.hasDenormModeInst()) {
2963     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2964     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2965 
2966     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2967     B.buildInstr(AMDGPU::S_DENORM_MODE)
2968       .addImm(NewDenormModeValue);
2969 
2970   } else {
2971     // Select FP32 bit field in mode register.
2972     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2973                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2974                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
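    // Sketch of the field encoding (assuming the usual hwreg(id, offset,
    // width) layout): this selects a 2-bit field starting at bit 4 of the
    // MODE register, i.e. the FP32 denorm controls, leaving the FP64/FP16
    // denorm bits above it untouched.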
2975 
2976     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2977       .addImm(SPDenormMode)
2978       .addImm(SPDenormModeBitField);
2979   }
2980 }
2981 
2982 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2983                                          MachineRegisterInfo &MRI,
2984                                          MachineIRBuilder &B) const {
2985   Register Res = MI.getOperand(0).getReg();
2986   Register LHS = MI.getOperand(1).getReg();
2987   Register RHS = MI.getOperand(2).getReg();
2988   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2989   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2990 
2991   uint16_t Flags = MI.getFlags();
2992 
2993   LLT S32 = LLT::scalar(32);
2994   LLT S1 = LLT::scalar(1);
2995 
2996   auto One = B.buildFConstant(S32, 1.0f);
2997 
2998   auto DenominatorScaled =
2999     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3000       .addUse(LHS)
3001       .addUse(RHS)
3002       .addImm(0)
3003       .setMIFlags(Flags);
3004   auto NumeratorScaled =
3005     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3006       .addUse(LHS)
3007       .addUse(RHS)
3008       .addImm(1)
3009       .setMIFlags(Flags);
3010 
3011   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3012     .addUse(DenominatorScaled.getReg(0))
3013     .setMIFlags(Flags);
3014   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3015 
3016   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3017   // aren't modeled as reading it.
3018   if (!Mode.allFP32Denormals())
3019     toggleSPDenormMode(true, B, ST, Mode);
3020 
3021   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3022   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3023   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3024   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3025   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3026   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
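  // The sequence above is roughly a Newton-Raphson refinement on the scaled
  // operands n' (NumeratorScaled) and d' (DenominatorScaled), illustrative
  // only:
  //   e0 = 1 - d' * r0         (Fma0, r0 = approximate rcp)
  //   r1 = r0 + r0 * e0        (Fma1, refined reciprocal)
  //   q0 = n' * r1             (Mul)
  //   e1 = n' - d' * q0        (Fma2)
  //   q1 = q0 + r1 * e1        (Fma3)
  //   e2 = n' - d' * q1        (Fma4)
  // div_fmas below then forms q1 + r1 * e2 with the scale flag applied, and
  // div_fixup handles the special cases using the original operands.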
3027 
3028   if (!Mode.allFP32Denormals())
3029     toggleSPDenormMode(false, B, ST, Mode);
3030 
3031   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3032     .addUse(Fma4.getReg(0))
3033     .addUse(Fma1.getReg(0))
3034     .addUse(Fma3.getReg(0))
3035     .addUse(NumeratorScaled.getReg(1))
3036     .setMIFlags(Flags);
3037 
3038   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3039     .addUse(Fmas.getReg(0))
3040     .addUse(RHS)
3041     .addUse(LHS)
3042     .setMIFlags(Flags);
3043 
3044   MI.eraseFromParent();
3045   return true;
3046 }
3047 
3048 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3049                                          MachineRegisterInfo &MRI,
3050                                          MachineIRBuilder &B) const {
3051   Register Res = MI.getOperand(0).getReg();
3052   Register LHS = MI.getOperand(1).getReg();
3053   Register RHS = MI.getOperand(2).getReg();
3054 
3055   uint16_t Flags = MI.getFlags();
3056 
3057   LLT S64 = LLT::scalar(64);
3058   LLT S1 = LLT::scalar(1);
3059 
3060   auto One = B.buildFConstant(S64, 1.0);
3061 
3062   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3063     .addUse(LHS)
3064     .addUse(RHS)
3065     .addImm(0)
3066     .setMIFlags(Flags);
3067 
3068   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3069 
3070   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3071     .addUse(DivScale0.getReg(0))
3072     .setMIFlags(Flags);
3073 
3074   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3075   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3076   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3077 
3078   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3079     .addUse(LHS)
3080     .addUse(RHS)
3081     .addImm(1)
3082     .setMIFlags(Flags);
3083 
3084   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3085   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3086   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
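  // As in the f32 path, this is roughly two reciprocal refinements followed
  // by one quotient refinement on the scaled operands (illustrative only):
  //   r1 = rcp + rcp * (1 - d * rcp)    (Fma0, Fma1)
  //   r2 = r1 + r1 * (1 - d * r1)       (Fma2, Fma3)
  //   q  = n * r2                       (Mul)
  //   e  = n - d * q                    (Fma4)
  // where d is DivScale0 and n is DivScale1; div_fmas/div_fixup finish the
  // job below.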
3087 
3088   Register Scale;
3089   if (!ST.hasUsableDivScaleConditionOutput()) {
3090     // Work around a hardware bug on SI where the condition output from div_scale
3091     // is not usable.
3092 
3093     LLT S32 = LLT::scalar(32);
3094 
3095     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3096     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3097     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3098     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3099 
3100     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3101                               Scale1Unmerge.getReg(1));
3102     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3103                               Scale0Unmerge.getReg(1));
3104     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3105   } else {
3106     Scale = DivScale1.getReg(1);
3107   }
3108 
3109   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3110     .addUse(Fma4.getReg(0))
3111     .addUse(Fma3.getReg(0))
3112     .addUse(Mul.getReg(0))
3113     .addUse(Scale)
3114     .setMIFlags(Flags);
3115 
3116   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3117     .addUse(Fmas.getReg(0))
3118     .addUse(RHS)
3119     .addUse(LHS)
3120     .setMIFlags(Flags);
3121 
3122   MI.eraseFromParent();
3123   return true;
3124 }
3125 
3126 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3127                                                  MachineRegisterInfo &MRI,
3128                                                  MachineIRBuilder &B) const {
3129   Register Res = MI.getOperand(0).getReg();
3130   Register LHS = MI.getOperand(2).getReg();
3131   Register RHS = MI.getOperand(3).getReg();
3132   uint16_t Flags = MI.getFlags();
3133 
3134   LLT S32 = LLT::scalar(32);
3135   LLT S1 = LLT::scalar(1);
3136 
3137   auto Abs = B.buildFAbs(S32, RHS, Flags);
3138   const APFloat C0Val(1.0f);
3139 
3140   auto C0 = B.buildConstant(S32, 0x6f800000);
3141   auto C1 = B.buildConstant(S32, 0x2f800000);
3142   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
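  // Sketch of the scaling trick (illustrative only): 0x6f800000 is 2^+96 and
  // 0x2f800000 is 2^-32 as f32 bit patterns. When |RHS| exceeds 2^+96, the
  // denominator is pre-scaled by 2^-32 so rcp stays in range, and the same
  // factor is re-applied to the final product, so the result is still
  // approximately LHS / RHS.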
3143 
3144   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3145   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3146 
3147   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3148 
3149   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3150     .addUse(Mul0.getReg(0))
3151     .setMIFlags(Flags);
3152 
3153   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3154 
3155   B.buildFMul(Res, Sel, Mul1, Flags);
3156 
3157   MI.eraseFromParent();
3158   return true;
3159 }
3160 
3161 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3162                                             MachineRegisterInfo &MRI,
3163                                             MachineIRBuilder &B) const {
3164   uint64_t Offset =
3165     ST.getTargetLowering()->getImplicitParameterOffset(
3166       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3167   LLT DstTy = MRI.getType(DstReg);
3168   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3169 
3170   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3171   if (!loadInputValue(KernargPtrReg, B,
3172                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3173     return false;
3174 
3175   // FIXME: This should be nuw
3176   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3177   return true;
3178 }
3179 
3180 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3181                                                  MachineRegisterInfo &MRI,
3182                                                  MachineIRBuilder &B) const {
3183   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3184   if (!MFI->isEntryFunction()) {
3185     return legalizePreloadedArgIntrin(MI, MRI, B,
3186                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3187   }
3188 
3189   Register DstReg = MI.getOperand(0).getReg();
3190   if (!getImplicitArgPtr(DstReg, MRI, B))
3191     return false;
3192 
3193   MI.eraseFromParent();
3194   return true;
3195 }
3196 
3197 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3198                                               MachineRegisterInfo &MRI,
3199                                               MachineIRBuilder &B,
3200                                               unsigned AddrSpace) const {
3201   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3202   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3203   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3204   MI.eraseFromParent();
3205   return true;
3206 }
3207 
3208 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3209 // offset (the offset that is included in bounds checking and swizzling, to be
3210 // split between the instruction's voffset and immoffset fields) and soffset
3211 // (the offset that is excluded from bounds checking and swizzling, to go in
3212 // the instruction's soffset field).  This function takes the first kind of
3213 // offset and figures out how to split it between voffset and immoffset.
3214 std::tuple<Register, unsigned, unsigned>
3215 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3216                                         Register OrigOffset) const {
3217   const unsigned MaxImm = 4095;
3218   Register BaseReg;
3219   unsigned TotalConstOffset;
3220   MachineInstr *OffsetDef;
3221   const LLT S32 = LLT::scalar(32);
3222 
3223   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3224     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3225 
3226   unsigned ImmOffset = TotalConstOffset;
3227 
3228   // If the immediate value is too big for the immoffset field, put the value
3229   // and -4096 into the immoffset field so that the value that is copied/added
3230   // for the voffset field is a multiple of 4096, and it stands more chance
3231   // of being CSEd with the copy/add for another similar load/store.
3232   // However, do not do that rounding down to a multiple of 4096 if that is a
3233   // negative number, as it appears to be illegal to have a negative offset
3234   // in the vgpr, even if adding the immediate offset makes it positive.
3235   unsigned Overflow = ImmOffset & ~MaxImm;
3236   ImmOffset -= Overflow;
3237   if ((int32_t)Overflow < 0) {
3238     Overflow += ImmOffset;
3239     ImmOffset = 0;
3240   }
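  // Worked example (illustrative only): a constant offset of 8204 splits as
  //   Overflow  = 8204 & ~4095 = 8192  -> folded into the voffset below
  //   ImmOffset = 8204 - 8192  = 12    -> placed in the immediate offset field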
3241 
3242   if (Overflow != 0) {
3243     if (!BaseReg) {
3244       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3245     } else {
3246       auto OverflowVal = B.buildConstant(S32, Overflow);
3247       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3248     }
3249   }
3250 
3251   if (!BaseReg)
3252     BaseReg = B.buildConstant(S32, 0).getReg(0);
3253 
3254   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3255 }
3256 
3257 /// Handle register layout difference for f16 images for some subtargets.
3258 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3259                                              MachineRegisterInfo &MRI,
3260                                              Register Reg) const {
3261   if (!ST.hasUnpackedD16VMem())
3262     return Reg;
3263 
3264   const LLT S16 = LLT::scalar(16);
3265   const LLT S32 = LLT::scalar(32);
3266   LLT StoreVT = MRI.getType(Reg);
3267   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3268 
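  // Sketch of the repack for unpacked D16 subtargets (illustrative only): a
  // <4 x s16> value becomes <4 x s32>, each element any-extended into the low
  // half of its own 32-bit register:
  //   %e0:_(s16), %e1:_(s16), ... = G_UNMERGE_VALUES %reg(<4 x s16>)
  //   %w0:_(s32) = G_ANYEXT %e0:_(s16)
  //   ...
  //   %wide:_(<4 x s32>) = G_BUILD_VECTOR %w0:_(s32), ...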
3269   auto Unmerge = B.buildUnmerge(S16, Reg);
3270 
3271   SmallVector<Register, 4> WideRegs;
3272   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3273     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3274 
3275   int NumElts = StoreVT.getNumElements();
3276 
3277   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3278 }
3279 
3280 Register AMDGPULegalizerInfo::fixStoreSourceType(
3281   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3282   MachineRegisterInfo *MRI = B.getMRI();
3283   LLT Ty = MRI->getType(VData);
3284 
3285   const LLT S16 = LLT::scalar(16);
3286 
3287   // Fixup illegal register types for i8 and i16 stores.
3288   if (Ty == LLT::scalar(8) || Ty == S16) {
3289     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3290     return AnyExt;
3291   }
3292 
3293   if (Ty.isVector()) {
3294     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3295       if (IsFormat)
3296         return handleD16VData(B, *MRI, VData);
3297     }
3298   }
3299 
3300   return VData;
3301 }
3302 
3303 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3304                                               MachineRegisterInfo &MRI,
3305                                               MachineIRBuilder &B,
3306                                               bool IsTyped,
3307                                               bool IsFormat) const {
3308   Register VData = MI.getOperand(1).getReg();
3309   LLT Ty = MRI.getType(VData);
3310   LLT EltTy = Ty.getScalarType();
3311   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3312   const LLT S32 = LLT::scalar(32);
3313 
3314   VData = fixStoreSourceType(B, VData, IsFormat);
3315   Register RSrc = MI.getOperand(2).getReg();
3316 
3317   MachineMemOperand *MMO = *MI.memoperands_begin();
3318   const int MemSize = MMO->getSize();
3319 
3320   unsigned ImmOffset;
3321   unsigned TotalOffset;
3322 
3323   // The typed intrinsics add an immediate after the registers.
3324   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3325 
3326   // The struct intrinsic variants add one additional operand over raw.
3327   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3328   Register VIndex;
3329   int OpOffset = 0;
3330   if (HasVIndex) {
3331     VIndex = MI.getOperand(3).getReg();
3332     OpOffset = 1;
3333   }
3334 
3335   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3336   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3337 
3338   unsigned Format = 0;
3339   if (IsTyped) {
3340     Format = MI.getOperand(5 + OpOffset).getImm();
3341     ++OpOffset;
3342   }
3343 
3344   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3345 
3346   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3347   if (TotalOffset != 0)
3348     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3349 
3350   unsigned Opc;
3351   if (IsTyped) {
3352     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3353                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3354   } else if (IsFormat) {
3355     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3356                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3357   } else {
3358     switch (MemSize) {
3359     case 1:
3360       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3361       break;
3362     case 2:
3363       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3364       break;
3365     default:
3366       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3367       break;
3368     }
3369   }
3370 
3371   if (!VIndex)
3372     VIndex = B.buildConstant(S32, 0).getReg(0);
3373 
3374   auto MIB = B.buildInstr(Opc)
3375     .addUse(VData)              // vdata
3376     .addUse(RSrc)               // rsrc
3377     .addUse(VIndex)             // vindex
3378     .addUse(VOffset)            // voffset
3379     .addUse(SOffset)            // soffset
3380     .addImm(ImmOffset);         // offset(imm)
3381 
3382   if (IsTyped)
3383     MIB.addImm(Format);
3384 
3385   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3386      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3387      .addMemOperand(MMO);
3388 
3389   MI.eraseFromParent();
3390   return true;
3391 }
3392 
3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3394                                              MachineRegisterInfo &MRI,
3395                                              MachineIRBuilder &B,
3396                                              bool IsFormat,
3397                                              bool IsTyped) const {
3398   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3399   MachineMemOperand *MMO = *MI.memoperands_begin();
3400   const int MemSize = MMO->getSize();
3401   const LLT S32 = LLT::scalar(32);
3402 
3403   Register Dst = MI.getOperand(0).getReg();
3404   Register RSrc = MI.getOperand(2).getReg();
3405 
3406   // The typed intrinsics add an immediate after the registers.
3407   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3408 
3409   // The struct intrinsic variants add one additional operand over raw.
3410   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3411   Register VIndex;
3412   int OpOffset = 0;
3413   if (HasVIndex) {
3414     VIndex = MI.getOperand(3).getReg();
3415     OpOffset = 1;
3416   }
3417 
3418   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3419   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3420 
3421   unsigned Format = 0;
3422   if (IsTyped) {
3423     Format = MI.getOperand(5 + OpOffset).getImm();
3424     ++OpOffset;
3425   }
3426 
3427   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3428   unsigned ImmOffset;
3429   unsigned TotalOffset;
3430 
3431   LLT Ty = MRI.getType(Dst);
3432   LLT EltTy = Ty.getScalarType();
3433   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3434   const bool Unpacked = ST.hasUnpackedD16VMem();
3435 
3436   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3437   if (TotalOffset != 0)
3438     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3439 
3440   unsigned Opc;
3441 
3442   if (IsTyped) {
3443     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3444                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3445   } else if (IsFormat) {
3446     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3447                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3448   } else {
3449     switch (MemSize) {
3450     case 1:
3451       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3452       break;
3453     case 2:
3454       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3455       break;
3456     default:
3457       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3458       break;
3459     }
3460   }
3461 
3462   Register LoadDstReg;
3463 
3464   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3465   LLT UnpackedTy = Ty.changeElementSize(32);
3466 
3467   if (IsExtLoad)
3468     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3469   else if (Unpacked && IsD16 && Ty.isVector())
3470     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3471   else
3472     LoadDstReg = Dst;
3473 
3474   if (!VIndex)
3475     VIndex = B.buildConstant(S32, 0).getReg(0);
3476 
3477   auto MIB = B.buildInstr(Opc)
3478     .addDef(LoadDstReg)         // vdata
3479     .addUse(RSrc)               // rsrc
3480     .addUse(VIndex)             // vindex
3481     .addUse(VOffset)            // voffset
3482     .addUse(SOffset)            // soffset
3483     .addImm(ImmOffset);         // offset(imm)
3484 
3485   if (IsTyped)
3486     MIB.addImm(Format);
3487 
3488   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3489      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3490      .addMemOperand(MMO);
3491 
3492   if (LoadDstReg != Dst) {
3493     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3494 
3495     // The result of extending loads was widened to s32; truncate back to the result type.
3496     if (IsExtLoad)
3497       B.buildTrunc(Dst, LoadDstReg);
3498     else {
3499       // Repack to original 16-bit vector result
3500       // FIXME: G_TRUNC should work, but legalization currently fails
3501       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3502       SmallVector<Register, 4> Repack;
3503       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3504         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3505       B.buildMerge(Dst, Repack);
3506     }
3507   }
3508 
3509   MI.eraseFromParent();
3510   return true;
3511 }
3512 
3513 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3514                                                MachineIRBuilder &B,
3515                                                bool IsInc) const {
3516   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3517                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3518   B.buildInstr(Opc)
3519     .addDef(MI.getOperand(0).getReg())
3520     .addUse(MI.getOperand(2).getReg())
3521     .addUse(MI.getOperand(3).getReg())
3522     .cloneMemRefs(MI);
3523   MI.eraseFromParent();
3524   return true;
3525 }
3526 
3527 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3528   switch (IntrID) {
3529   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3530   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3531     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3532   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3533   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3534     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3535   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3536   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3537     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3538   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3539   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3540     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3541   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3542   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3543     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3544   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3545   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3546     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3547   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3548   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3550   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3551   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3552     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3553   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3554   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3556   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3557   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3558     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3559   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3560   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3561     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3562   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3563   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3565   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3566   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3567     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3568   default:
3569     llvm_unreachable("unhandled atomic opcode");
3570   }
3571 }
3572 
3573 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3574                                                MachineIRBuilder &B,
3575                                                Intrinsic::ID IID) const {
3576   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3577                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3578 
3579   Register Dst = MI.getOperand(0).getReg();
3580   Register VData = MI.getOperand(2).getReg();
3581 
3582   Register CmpVal;
3583   int OpOffset = 0;
3584 
3585   if (IsCmpSwap) {
3586     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3587     ++OpOffset;
3588   }
3589 
3590   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3591   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3592 
3593   // The struct intrinsic variants add one additional operand over raw.
3594   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3595   Register VIndex;
3596   if (HasVIndex) {
3597     VIndex = MI.getOperand(4 + OpOffset).getReg();
3598     ++OpOffset;
3599   }
3600 
3601   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3602   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3603   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3604 
3605   MachineMemOperand *MMO = *MI.memoperands_begin();
3606 
3607   unsigned ImmOffset;
3608   unsigned TotalOffset;
3609   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3610   if (TotalOffset != 0)
3611     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3612 
3613   if (!VIndex)
3614     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3615 
3616   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3617     .addDef(Dst)
3618     .addUse(VData); // vdata
3619 
3620   if (IsCmpSwap)
3621     MIB.addReg(CmpVal);
3622 
3623   MIB.addUse(RSrc)               // rsrc
3624      .addUse(VIndex)             // vindex
3625      .addUse(VOffset)            // voffset
3626      .addUse(SOffset)            // soffset
3627      .addImm(ImmOffset)          // offset(imm)
3628      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3629      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3630      .addMemOperand(MMO);
3631 
3632   MI.eraseFromParent();
3633   return true;
3634 }
3635 
3636 /// Pack the s16 typed address registers of \p MI into dword sized vectors
3637 /// with s16 typed elements, appending the results to \p PackedAddrs.
3638 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3639                                         SmallVectorImpl<Register> &PackedAddrs,
3640                                         int AddrIdx, int DimIdx, int EndIdx,
3641                                         int NumGradients) {
3642   const LLT S16 = LLT::scalar(16);
3643   const LLT V2S16 = LLT::vector(2, 16);
3644 
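  // Illustrative example only: for a 2-D gradient sample, the four
  // derivatives (dx/dh, dy/dh, dx/dv, dy/dv) and the two coordinates pack
  // pairwise into three v2s16 registers. In 1-D, each derivative and the lone
  // coordinate are instead paired with an undef s16.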
3645   for (int I = AddrIdx; I < EndIdx; ++I) {
3646     MachineOperand &SrcOp = MI.getOperand(I);
3647     if (!SrcOp.isReg())
3648       continue; // _L to _LZ may have eliminated this.
3649 
3650     Register AddrReg = SrcOp.getReg();
3651 
3652     if (I < DimIdx) {
3653       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3654       PackedAddrs.push_back(AddrReg);
3655     } else {
3656       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3657       // derivatives dx/dh and dx/dv are packed with undef.
3658       if (((I + 1) >= EndIdx) ||
3659           ((NumGradients / 2) % 2 == 1 &&
3660            (I == DimIdx + (NumGradients / 2) - 1 ||
3661             I == DimIdx + NumGradients - 1)) ||
3662           // Check for _L to _LZ optimization
3663           !MI.getOperand(I + 1).isReg()) {
3664         PackedAddrs.push_back(
3665             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3666                 .getReg(0));
3667       } else {
3668         PackedAddrs.push_back(
3669             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3670                 .getReg(0));
3671         ++I;
3672       }
3673     }
3674   }
3675 }
3676 
3677 /// Convert from separate vaddr components to a single vector address register,
3678 /// and replace the remaining operands with $noreg.
3679 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3680                                      int DimIdx, int NumVAddrs) {
3681   const LLT S32 = LLT::scalar(32);
3682 
3683   SmallVector<Register, 8> AddrRegs;
3684   for (int I = 0; I != NumVAddrs; ++I) {
3685     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3686     if (SrcOp.isReg()) {
3687       AddrRegs.push_back(SrcOp.getReg());
3688       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3689     }
3690   }
3691 
3692   int NumAddrRegs = AddrRegs.size();
3693   if (NumAddrRegs != 1) {
3694     // Round up to 8 elements for v5-v7
3695     // FIXME: Missing intermediate sized register classes and instructions.
3696     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3697       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3698       auto Undef = B.buildUndef(S32);
3699       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3700       NumAddrRegs = RoundedNumRegs;
3701     }
3702 
3703     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3704     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3705   }
3706 
3707   for (int I = 1; I != NumVAddrs; ++I) {
3708     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3709     if (SrcOp.isReg())
3710       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3711   }
3712 }
3713 
3714 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3715 ///
3716 /// Depending on the subtarget, load/store with 16-bit element data need to be
3717 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3718 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3719 /// registers.
3720 ///
3721 /// We don't want to directly select image instructions just yet, but also want
3722 /// to expose all register repacking to the legalizer/combiners. We also don't
3723 /// want a selected instruction entering RegBankSelect. In order to avoid
3724 /// defining a multitude of intermediate image instructions, directly hack on
3725 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3726 /// now unnecessary arguments with $noreg.
3727 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3728     MachineInstr &MI, MachineIRBuilder &B,
3729     GISelChangeObserver &Observer,
3730     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3731 
3732   const int NumDefs = MI.getNumExplicitDefs();
3733   bool IsTFE = NumDefs == 2;
3734   // We are only processing the operands of d16 image operations on subtargets
3735   // that use the unpacked register layout, or need to repack the TFE result.
3736 
3737   // TODO: Do we need to guard against already legalized intrinsics?
3738   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3739     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3740 
3741   MachineRegisterInfo *MRI = B.getMRI();
3742   const LLT S32 = LLT::scalar(32);
3743   const LLT S16 = LLT::scalar(16);
3744   const LLT V2S16 = LLT::vector(2, 16);
3745 
3746   // Index of first address argument
3747   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3748 
3749   int NumVAddrs, NumGradients;
3750   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3751   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3752     getDMaskIdx(BaseOpcode, NumDefs);
3753   unsigned DMask = 0;
3754 
3755   // Check for 16 bit addresses and pack if true.
3756   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3757   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3758   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3759   const bool IsG16 = GradTy == S16;
3760   const bool IsA16 = AddrTy == S16;
3761 
3762   int DMaskLanes = 0;
3763   if (!BaseOpcode->Atomic) {
3764     DMask = MI.getOperand(DMaskIdx).getImm();
3765     if (BaseOpcode->Gather4) {
3766       DMaskLanes = 4;
3767     } else if (DMask != 0) {
3768       DMaskLanes = countPopulation(DMask);
3769     } else if (!IsTFE && !BaseOpcode->Store) {
3770       // If dmask is 0, this is a no-op load. This can be eliminated.
3771       B.buildUndef(MI.getOperand(0));
3772       MI.eraseFromParent();
3773       return true;
3774     }
3775   }
3776 
3777   Observer.changingInstr(MI);
3778   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3779 
3780   unsigned NewOpcode = NumDefs == 0 ?
3781     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3782 
3783   // Track that we legalized this
3784   MI.setDesc(B.getTII().get(NewOpcode));
3785 
3786   // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
3787   // to be at least 1, otherwise the instruction will fail.
3788   if (IsTFE && DMask == 0) {
3789     DMask = 0x1;
3790     DMaskLanes = 1;
3791     MI.getOperand(DMaskIdx).setImm(DMask);
3792   }
3793 
3794   if (BaseOpcode->Atomic) {
3795     Register VData0 = MI.getOperand(2).getReg();
3796     LLT Ty = MRI->getType(VData0);
3797 
3798     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3799     if (Ty.isVector())
3800       return false;
3801 
3802     if (BaseOpcode->AtomicX2) {
3803       Register VData1 = MI.getOperand(3).getReg();
3804       // The two values are packed in one register.
3805       LLT PackedTy = LLT::vector(2, Ty);
3806       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3807       MI.getOperand(2).setReg(Concat.getReg(0));
3808       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3809     }
3810   }
3811 
3812   int CorrectedNumVAddrs = NumVAddrs;
3813 
3814   // Optimize _L to _LZ when _L is zero
3815   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3816         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3817     const ConstantFP *ConstantLod;
3818     const int LodIdx = AddrIdx + NumVAddrs - 1;
3819 
3820     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3821       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3822         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3823         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3824           LZMappingInfo->LZ, ImageDimIntr->Dim);
3825 
3826         // The starting indexes should remain in the same place.
3827         --NumVAddrs;
3828         --CorrectedNumVAddrs;
3829 
3830         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3831           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3832         MI.RemoveOperand(LodIdx);
3833       }
3834     }
3835   }
3836 
3837   // Optimize _mip away when 'lod' is zero.
3838   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3839     int64_t ConstantLod;
3840     const int LodIdx = AddrIdx + NumVAddrs - 1;
3841 
3842     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3843       if (ConstantLod == 0) {
3844         // TODO: Change the intrinsic opcode and remove the operand instead of
3845         // replacing it with 0, as is done for the _L to _LZ handling above.
3846         MI.getOperand(LodIdx).ChangeToImmediate(0);
3847         --CorrectedNumVAddrs;
3848       }
3849     }
3850   }
3851 
3852   // Rewrite the addressing register layout before doing anything else.
3853   if (IsA16 || IsG16) {
3854     if (IsA16) {
3855       // Target must support the feature and gradients need to be 16 bit too
3856       if (!ST.hasA16() || !IsG16)
3857         return false;
3858     } else if (!ST.hasG16())
3859       return false;
3860 
3861     if (NumVAddrs > 1) {
3862       SmallVector<Register, 4> PackedRegs;
3863       // Don't compress addresses for G16
3864       const int PackEndIdx =
3865           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3866       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3867                                   PackEndIdx, NumGradients);
3868 
3869       if (!IsA16) {
3870         // Add uncompressed address
3871         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3872           Register AddrReg = MI.getOperand(I).getReg();
3873           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3874           PackedRegs.push_back(AddrReg);
3875         }
3876       }
3877 
3878       // See also below in the non-a16 branch
3879       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3880 
3881       if (!UseNSA && PackedRegs.size() > 1) {
3882         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3883         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3884         PackedRegs[0] = Concat.getReg(0);
3885         PackedRegs.resize(1);
3886       }
3887 
3888       const int NumPacked = PackedRegs.size();
3889       for (int I = 0; I != NumVAddrs; ++I) {
3890         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3891         if (!SrcOp.isReg()) {
3892           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3893           continue;
3894         }
3895 
3896         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3897 
3898         if (I < NumPacked)
3899           SrcOp.setReg(PackedRegs[I]);
3900         else
3901           SrcOp.setReg(AMDGPU::NoRegister);
3902       }
3903     }
3904   } else {
3905     // If the register allocator cannot place the address registers contiguously
3906     // without introducing moves, then using the non-sequential address encoding
3907     // is always preferable, since it saves VALU instructions and is usually a
3908     // wash in terms of code size or even better.
3909     //
3910     // However, we currently have no way of hinting to the register allocator
3911     // that MIMG addresses should be placed contiguously when it is possible to
3912     // do so, so force non-NSA for the common 2-address case as a heuristic.
3913     //
3914     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3915     // allocation when possible.
3916     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3917 
3918     if (!UseNSA && NumVAddrs > 1)
3919       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3920   }
3921 
3922   int Flags = 0;
3923   if (IsA16)
3924     Flags |= 1;
3925   if (IsG16)
3926     Flags |= 2;
3927   MI.addOperand(MachineOperand::CreateImm(Flags));
3928 
3929   if (BaseOpcode->Store) { // No TFE for stores?
3930     // TODO: Handle dmask trim
3931     Register VData = MI.getOperand(1).getReg();
3932     LLT Ty = MRI->getType(VData);
3933     if (!Ty.isVector() || Ty.getElementType() != S16)
3934       return true;
3935 
3936     Register RepackedReg = handleD16VData(B, *MRI, VData);
3937     if (RepackedReg != VData) {
3938       MI.getOperand(1).setReg(RepackedReg);
3939     }
3940 
3941     return true;
3942   }
3943 
3944   Register DstReg = MI.getOperand(0).getReg();
3945   LLT Ty = MRI->getType(DstReg);
3946   const LLT EltTy = Ty.getScalarType();
3947   const bool IsD16 = Ty.getScalarType() == S16;
3948   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3949 
3950   // Confirm that the return type is large enough for the dmask specified
3951   if (NumElts < DMaskLanes)
3952     return false;
3953 
3954   if (NumElts > 4 || DMaskLanes > 4)
3955     return false;
3956 
3957   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3958   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3959 
3960   // The raw dword aligned data component of the load. The only legal cases
3961   // where this matters should be when using the packed D16 format, for
3962   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3963   LLT RoundedTy;
3964 
3965   // S32 vector to cover all data, plus the TFE result element.
3966   LLT TFETy;
3967 
3968   // Register type to use for each loaded component. Will be S32 or V2S16.
3969   LLT RegTy;
3970 
3971   if (IsD16 && ST.hasUnpackedD16VMem()) {
3972     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3973     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3974     RegTy = S32;
3975   } else {
3976     unsigned EltSize = EltTy.getSizeInBits();
3977     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3978     unsigned RoundedSize = 32 * RoundedElts;
3979     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3980     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3981     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3982   }
3983 
3984   // The return type does not need adjustment.
3985   // TODO: Should we change s16 case to s32 or <2 x s16>?
3986   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3987     return true;
3988 
3989   Register Dst1Reg;
3990 
3991   // Insert after the instruction.
3992   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3993 
3994   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3995   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3996   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3997   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3998 
3999   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4000 
4001   MI.getOperand(0).setReg(NewResultReg);
4002 
4003   // In the IR, TFE is supposed to be used with a 2 element struct return
4004   // type. The instruction really returns these two values in one contiguous
4005   // register, with one additional dword beyond the loaded data. Rewrite the
4006   // return type to use a single register result.
4007 
4008   if (IsTFE) {
4009     Dst1Reg = MI.getOperand(1).getReg();
4010     if (MRI->getType(Dst1Reg) != S32)
4011       return false;
4012 
4013     // TODO: Make sure the TFE operand bit is set.
4014     MI.RemoveOperand(1);
4015 
4016     // Handle the easy case that requires no repack instructions.
4017     if (Ty == S32) {
4018       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4019       return true;
4020     }
4021   }
4022 
4023   // Now figure out how to copy the new result register back into the old
4024   // result.
4025   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4026 
4027   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4028 
4029   if (ResultNumRegs == 1) {
4030     assert(!IsTFE);
4031     ResultRegs[0] = NewResultReg;
4032   } else {
4033     // We have to repack into a new vector of some kind.
4034     for (int I = 0; I != NumDataRegs; ++I)
4035       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4036     B.buildUnmerge(ResultRegs, NewResultReg);
4037 
4038     // Drop the final TFE element to get the data part. The TFE result is
4039     // directly written to the right place already.
4040     if (IsTFE)
4041       ResultRegs.resize(NumDataRegs);
4042   }
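  // Illustrative TFE example: a four-dword load with TFE enabled reads five
  // dwords into NewResultReg (<5 x s32>). The unmerge above writes the four
  // data dwords into fresh registers and the trailing status dword directly
  // into Dst1Reg, which already occupies the tail slot of ResultRegs.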
4043 
4044   // For an s16 scalar result, we form an s32 result with a truncate regardless
4045   // of packed vs. unpacked.
4046   if (IsD16 && !Ty.isVector()) {
4047     B.buildTrunc(DstReg, ResultRegs[0]);
4048     return true;
4049   }
4050 
4051   // Avoid a build/concat_vector of 1 entry.
4052   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4053     B.buildBitcast(DstReg, ResultRegs[0]);
4054     return true;
4055   }
4056 
4057   assert(Ty.isVector());
4058 
4059   if (IsD16) {
4060     // For packed D16 results with TFE enabled, all the data components are
4061     // S32. Cast back to the expected type.
4062     //
4063     // TODO: We don't really need to load s32 elements. We would only need one
4064     // cast for the TFE result if a multiple of v2s16 was used.
4065     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4066       for (Register &Reg : ResultRegs)
4067         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4068     } else if (ST.hasUnpackedD16VMem()) {
4069       for (Register &Reg : ResultRegs)
4070         Reg = B.buildTrunc(S16, Reg).getReg(0);
4071     }
4072   }
4073 
4074   auto padWithUndef = [&](LLT Ty, int NumElts) {
4075     if (NumElts == 0)
4076       return;
4077     Register Undef = B.buildUndef(Ty).getReg(0);
4078     for (int I = 0; I != NumElts; ++I)
4079       ResultRegs.push_back(Undef);
4080   };
4081 
4082   // Pad out any elements eliminated due to the dmask.
4083   LLT ResTy = MRI->getType(ResultRegs[0]);
4084   if (!ResTy.isVector()) {
4085     padWithUndef(ResTy, NumElts - ResultRegs.size());
4086     B.buildBuildVector(DstReg, ResultRegs);
4087     return true;
4088   }
4089 
4090   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4091   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4092 
4093   // Deal with the one annoying legal case.
4094   const LLT V3S16 = LLT::vector(3, 16);
4095   if (Ty == V3S16) {
4096     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4097     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4098     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4099     return true;
4100   }
4101 
4102   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4103   B.buildConcatVectors(DstReg, ResultRegs);
4104   return true;
4105 }
4106 
4107 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4108   MachineInstr &MI, MachineIRBuilder &B,
4109   GISelChangeObserver &Observer) const {
4110   Register Dst = MI.getOperand(0).getReg();
4111   LLT Ty = B.getMRI()->getType(Dst);
4112   unsigned Size = Ty.getSizeInBits();
4113   MachineFunction &MF = B.getMF();
4114 
4115   Observer.changingInstr(MI);
4116 
4117   // FIXME: We don't really need this intermediate instruction. The intrinsic
4118   // should be fixed to have a memory operand. Since it's readnone, we're not
4119   // allowed to add one.
4120   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4121   MI.RemoveOperand(1); // Remove intrinsic ID
4122 
4123   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4124   // TODO: Should this use datalayout alignment?
4125   const unsigned MemSize = (Size + 7) / 8;
4126   const Align MemAlign(4);
4127   MachineMemOperand *MMO = MF.getMachineMemOperand(
4128       MachinePointerInfo(),
4129       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4130           MachineMemOperand::MOInvariant,
4131       MemSize, MemAlign);
4132   MI.addMemOperand(MF, MMO);
4133 
4134   // There are no 96-bit result scalar loads, but widening to 128-bit should
4135   // always be legal. We may need to restore this to a 96-bit result if it turns
4136   // out this needs to be converted to a vector load during RegBankSelect.
4137   if (!isPowerOf2_32(Size)) {
4138     LegalizerHelper Helper(MF, *this, Observer, B);
4139 
4140     if (Ty.isVector())
4141       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4142     else
4143       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4144   }
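  // For example (illustrative only): an s96 result is widened to s128 and a
  // <3 x s32> result is padded to <4 x s32>; per the comment above, this may
  // need to be narrowed back if RegBankSelect turns it into a vector load.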
4145 
4146   Observer.changedInstr(MI);
4147   return true;
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4151                                                 MachineRegisterInfo &MRI,
4152                                                 MachineIRBuilder &B) const {
4153   // On a non-HSA path, or with the trap handler disabled, just insert s_endpgm.
4154   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4155       !ST.isTrapHandlerEnabled()) {
4156     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4157   } else {
4158     // Pass queue pointer to trap handler as input, and insert trap instruction
4159     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4160     MachineRegisterInfo &MRI = *B.getMRI();
4161     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4162     Register LiveIn = getLiveInRegister(
4163         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4164         /*InsertLiveInCopy=*/false);
4165     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4166       return false;
4167     B.buildCopy(SGPR01, LiveIn);
4168     B.buildInstr(AMDGPU::S_TRAP)
4169         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4170         .addReg(SGPR01, RegState::Implicit);
4171   }
4172 
4173   MI.eraseFromParent();
4174   return true;
4175 }
4176 
4177 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4178     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4179   // On a non-HSA path, or with the trap handler disabled, report a warning
4180   // instead of emitting the trap.
4181   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4182       !ST.isTrapHandlerEnabled()) {
4183     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4184                                      "debugtrap handler not supported",
4185                                      MI.getDebugLoc(), DS_Warning);
4186     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4187     Ctx.diagnose(NoTrap);
4188   } else {
4189     // Insert debug-trap instruction
4190     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4191   }
4192 
4193   MI.eraseFromParent();
4194   return true;
4195 }
4196 
4197 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4198                                             MachineInstr &MI) const {
4199   MachineIRBuilder &B = Helper.MIRBuilder;
4200   MachineRegisterInfo &MRI = *B.getMRI();
4201 
4202   // Replace the G_BRCOND user with the exec manipulation and branch pseudos.
4203   auto IntrID = MI.getIntrinsicID();
4204   switch (IntrID) {
4205   case Intrinsic::amdgcn_if:
4206   case Intrinsic::amdgcn_else: {
4207     MachineInstr *Br = nullptr;
4208     MachineBasicBlock *UncondBrTarget = nullptr;
4209     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4210       const SIRegisterInfo *TRI
4211         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4212 
4213       Register Def = MI.getOperand(1).getReg();
4214       Register Use = MI.getOperand(3).getReg();
4215 
4216       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4217       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4218       if (IntrID == Intrinsic::amdgcn_if) {
4219         B.buildInstr(AMDGPU::SI_IF)
4220           .addDef(Def)
4221           .addUse(Use)
4222           .addMBB(UncondBrTarget);
4223       } else {
4224         B.buildInstr(AMDGPU::SI_ELSE)
4225           .addDef(Def)
4226           .addUse(Use)
4227           .addMBB(UncondBrTarget)
4228           .addImm(0);
4229       }
4230 
4231       if (Br) {
4232         Br->getOperand(0).setMBB(CondBrTarget);
4233       } else {
4234         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4235         // since we're swapping branch targets it needs to be reinserted.
4236         // FIXME: IRTranslator should probably not do this
4237         B.buildBr(*CondBrTarget);
4238       }
4239 
4240       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4241       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4242       MI.eraseFromParent();
4243       BrCond->eraseFromParent();
4244       return true;
4245     }
4246 
4247     return false;
4248   }
4249   case Intrinsic::amdgcn_loop: {
4250     MachineInstr *Br = nullptr;
4251     MachineBasicBlock *UncondBrTarget = nullptr;
4252     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4253       const SIRegisterInfo *TRI
4254         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4255 
4256       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4257       Register Reg = MI.getOperand(2).getReg();
4258 
4259       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4260       B.buildInstr(AMDGPU::SI_LOOP)
4261         .addUse(Reg)
4262         .addMBB(UncondBrTarget);
4263 
4264       if (Br)
4265         Br->getOperand(0).setMBB(CondBrTarget);
4266       else
4267         B.buildBr(*CondBrTarget);
4268 
4269       MI.eraseFromParent();
4270       BrCond->eraseFromParent();
4271       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4272       return true;
4273     }
4274 
4275     return false;
4276   }
4277   case Intrinsic::amdgcn_kernarg_segment_ptr:
4278     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4279       // This only makes sense to call in a kernel, so just lower to null.
4280       B.buildConstant(MI.getOperand(0).getReg(), 0);
4281       MI.eraseFromParent();
4282       return true;
4283     }
4284 
4285     return legalizePreloadedArgIntrin(
4286       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4287   case Intrinsic::amdgcn_implicitarg_ptr:
4288     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
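  // The wavefront size is a subtarget constant, so this intrinsic simply
  // folds to an immediate.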
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
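  // For the buffer load/store intrinsics below, the two trailing boolean
  // arguments select the format (*.format) and typed (tbuffer) flavors; see
  // legalizeBufferStore and legalizeBufferLoad for the exact parameter order.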
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
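  // Any remaining image dimension intrinsic is dispatched to the table-driven
  // image legalizer; everything else is left untouched and reported as legal.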
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}