1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
// Widest value, in bits, that the legalizer treats as a single register
// value. Used by isRegisterSize() and as the upper clamp bound for scalars.
static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
// Check whether a load/store described by Query (register type, memory size,
// alignment, address space) is a size/alignment combination the target can
// handle for the given opcode.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than a single operation on this address space can
  // cover (see maxSizeForAddrSpace).
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Only power-of-2 dword-or-smaller sizes, plus 96 on subtargets with
  // dwordx3 operations, are directly usable.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target reports misaligned
  // access support for this size/address space/alignment.
  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299 // workaround this. Eventually it should ignore the type for loads and only care
300 // about the size. Return true in cases where we will workaround this for now by
301 // bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
448     // Technically the saturating operations require clamp bit support, but this
449     // was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
457     // We're just lowering this, but it helps get a better result to try to
458     // coerce to the desired type first.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .customIf(typeIsNot(0, PrivatePtr));
555 
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
675     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
676     // situations (like an invalid implicit use), we don't want to infinite loop
677     // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   // Lower roundeven into G_FRINT
715   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
716     .scalarize(0)
717     .lower();
718 
719   if (ST.has16BitInsts()) {
720     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
721       .legalFor({S16, S32, S64})
722       .clampScalar(0, S16, S64)
723       .scalarize(0);
724   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
725     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
726       .legalFor({S32, S64})
727       .clampScalar(0, S32, S64)
728       .scalarize(0);
729   } else {
730     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
731       .legalFor({S32})
732       .customFor({S64})
733       .clampScalar(0, S32, S64)
734       .scalarize(0);
735   }
736 
737   getActionDefinitionsBuilder(G_PTR_ADD)
738     .legalIf(all(isPointer(0), sameSize(0, 1)))
739     .scalarize(0)
740     .scalarSameSizeAs(1, 0);
741 
742   getActionDefinitionsBuilder(G_PTRMASK)
743     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
744     .scalarSameSizeAs(1, 0)
745     .scalarize(0);
746 
747   auto &CmpBuilder =
748     getActionDefinitionsBuilder(G_ICMP)
749     // The compare output type differs based on the register bank of the output,
750     // so make both s1 and s32 legal.
751     //
752     // Scalar compares producing output in scc will be promoted to s32, as that
753     // is the allocatable register type that will be needed for the copy from
754     // scc. This will be promoted during RegBankSelect, and we assume something
755     // before that won't try to use s32 result types.
756     //
757     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
758     // bank.
759     .legalForCartesianProduct(
760       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
761     .legalForCartesianProduct(
762       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
763   if (ST.has16BitInsts()) {
764     CmpBuilder.legalFor({{S1, S16}});
765   }
766 
767   CmpBuilder
768     .widenScalarToNextPow2(1)
769     .clampScalar(1, S32, S64)
770     .scalarize(0)
771     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
772 
773   getActionDefinitionsBuilder(G_FCMP)
774     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
775     .widenScalarToNextPow2(1)
776     .clampScalar(1, S32, S64)
777     .scalarize(0);
778 
779   // FIXME: fpow has a selection pattern that should move to custom lowering.
780   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
781   if (ST.has16BitInsts())
782     Exp2Ops.legalFor({S32, S16});
783   else
784     Exp2Ops.legalFor({S32});
785   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
786   Exp2Ops.scalarize(0);
787 
788   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
789   if (ST.has16BitInsts())
790     ExpOps.customFor({{S32}, {S16}});
791   else
792     ExpOps.customFor({S32});
793   ExpOps.clampScalar(0, MinScalarFPTy, S32)
794         .scalarize(0);
795 
796   getActionDefinitionsBuilder(G_FPOWI)
797     .clampScalar(0, MinScalarFPTy, S32)
798     .lower();
799 
800   // The 64-bit versions produce 32-bit results, but only on the SALU.
801   getActionDefinitionsBuilder(G_CTPOP)
802     .legalFor({{S32, S32}, {S32, S64}})
803     .clampScalar(0, S32, S32)
804     .clampScalar(1, S32, S64)
805     .scalarize(0)
806     .widenScalarToNextPow2(0, 32)
807     .widenScalarToNextPow2(1, 32);
808 
809   // The hardware instructions return a different result on 0 than the generic
810   // instructions expect. The hardware produces -1, but these produce the
811   // bitwidth.
812   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
813     .scalarize(0)
814     .clampScalar(0, S32, S32)
815     .clampScalar(1, S32, S64)
816     .widenScalarToNextPow2(0, 32)
817     .widenScalarToNextPow2(1, 32)
818     .lower();
819 
820   // The 64-bit versions produce 32-bit results, but only on the SALU.
821   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
822     .legalFor({{S32, S32}, {S32, S64}})
823     .clampScalar(0, S32, S32)
824     .clampScalar(1, S32, S64)
825     .scalarize(0)
826     .widenScalarToNextPow2(0, 32)
827     .widenScalarToNextPow2(1, 32);
828 
829   getActionDefinitionsBuilder(G_BITREVERSE)
830     .legalFor({S32})
831     .clampScalar(0, S32, S32)
832     .scalarize(0);
833 
834   if (ST.has16BitInsts()) {
835     getActionDefinitionsBuilder(G_BSWAP)
836       .legalFor({S16, S32, V2S16})
837       .clampMaxNumElements(0, S16, 2)
838       // FIXME: Fixing non-power-of-2 before clamp is workaround for
839       // narrowScalar limitation.
840       .widenScalarToNextPow2(0)
841       .clampScalar(0, S16, S32)
842       .scalarize(0);
843 
844     if (ST.hasVOP3PInsts()) {
845       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
846         .legalFor({S32, S16, V2S16})
847         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
848         .clampMaxNumElements(0, S16, 2)
849         .minScalar(0, S16)
850         .widenScalarToNextPow2(0)
851         .scalarize(0)
852         .lower();
853     } else {
854       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
855         .legalFor({S32, S16})
856         .widenScalarToNextPow2(0)
857         .minScalar(0, S16)
858         .scalarize(0)
859         .lower();
860     }
861   } else {
862     // TODO: Should have same legality without v_perm_b32
863     getActionDefinitionsBuilder(G_BSWAP)
864       .legalFor({S32})
865       .lowerIf(scalarNarrowerThan(0, 32))
866       // FIXME: Fixing non-power-of-2 before clamp is workaround for
867       // narrowScalar limitation.
868       .widenScalarToNextPow2(0)
869       .maxScalar(0, S32)
870       .scalarize(0)
871       .lower();
872 
873     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
874       .legalFor({S32})
875       .minScalar(0, S32)
876       .widenScalarToNextPow2(0)
877       .scalarize(0)
878       .lower();
879   }
880 
881   getActionDefinitionsBuilder(G_INTTOPTR)
882     // List the common cases
883     .legalForCartesianProduct(AddrSpaces64, {S64})
884     .legalForCartesianProduct(AddrSpaces32, {S32})
885     .scalarize(0)
886     // Accept any address space as long as the size matches
887     .legalIf(sameSize(0, 1))
888     .widenScalarIf(smallerThan(1, 0),
889       [](const LegalityQuery &Query) {
890         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
891       })
892     .narrowScalarIf(largerThan(1, 0),
893       [](const LegalityQuery &Query) {
894         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
895       });
896 
897   getActionDefinitionsBuilder(G_PTRTOINT)
898     // List the common cases
899     .legalForCartesianProduct(AddrSpaces64, {S64})
900     .legalForCartesianProduct(AddrSpaces32, {S32})
901     .scalarize(0)
902     // Accept any address space as long as the size matches
903     .legalIf(sameSize(0, 1))
904     .widenScalarIf(smallerThan(0, 1),
905       [](const LegalityQuery &Query) {
906         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
907       })
908     .narrowScalarIf(
909       largerThan(0, 1),
910       [](const LegalityQuery &Query) {
911         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
912       });
913 
914   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
915     .scalarize(0)
916     .custom();
917 
918   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
919                                     bool IsLoad) -> bool {
920     const LLT DstTy = Query.Types[0];
921 
922     // Split vector extloads.
923     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
924     unsigned Align = Query.MMODescrs[0].AlignInBits;
925 
926     if (MemSize < DstTy.getSizeInBits())
927       MemSize = std::max(MemSize, Align);
928 
929     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
930       return true;
931 
932     const LLT PtrTy = Query.Types[1];
933     unsigned AS = PtrTy.getAddressSpace();
934     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
935       return true;
936 
937     // Catch weird sized loads that don't evenly divide into the access sizes
938     // TODO: May be able to widen depending on alignment etc.
939     unsigned NumRegs = (MemSize + 31) / 32;
940     if (NumRegs == 3) {
941       if (!ST.hasDwordx3LoadStores())
942         return true;
943     } else {
944       // If the alignment allows, these should have been widened.
945       if (!isPowerOf2_32(NumRegs))
946         return true;
947     }
948 
949     if (Align < MemSize) {
950       const SITargetLowering *TLI = ST.getTargetLowering();
951       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
952     }
953 
954     return false;
955   };
956 
957   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
958                                          unsigned Opc) -> bool {
959     unsigned Size = Query.Types[0].getSizeInBits();
960     if (isPowerOf2_32(Size))
961       return false;
962 
963     if (Size == 96 && ST.hasDwordx3LoadStores())
964       return false;
965 
966     unsigned AddrSpace = Query.Types[1].getAddressSpace();
967     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
968       return false;
969 
970     unsigned Align = Query.MMODescrs[0].AlignInBits;
971     unsigned RoundedSize = NextPowerOf2(Size);
972     return (Align >= RoundedSize);
973   };
974 
975   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
976   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
977   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
978 
979   // TODO: Refine based on subtargets which support unaligned access or 128-bit
980   // LDS
981   // TODO: Unsupported flat for SI.
982 
983   for (unsigned Op : {G_LOAD, G_STORE}) {
984     const bool IsStore = Op == G_STORE;
985 
986     auto &Actions = getActionDefinitionsBuilder(Op);
987     // Explicitly list some common cases.
988     // TODO: Does this help compile time at all?
989     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
990                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
991                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
992                                       {S64, GlobalPtr, 64, GlobalAlign32},
993                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
994                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
995                                       {S32, GlobalPtr, 8, GlobalAlign8},
996                                       {S32, GlobalPtr, 16, GlobalAlign16},
997 
998                                       {S32, LocalPtr, 32, 32},
999                                       {S64, LocalPtr, 64, 32},
1000                                       {V2S32, LocalPtr, 64, 32},
1001                                       {S32, LocalPtr, 8, 8},
1002                                       {S32, LocalPtr, 16, 16},
1003                                       {V2S16, LocalPtr, 32, 32},
1004 
1005                                       {S32, PrivatePtr, 32, 32},
1006                                       {S32, PrivatePtr, 8, 8},
1007                                       {S32, PrivatePtr, 16, 16},
1008                                       {V2S16, PrivatePtr, 32, 32},
1009 
1010                                       {S32, ConstantPtr, 32, GlobalAlign32},
1011                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1012                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1013                                       {S64, ConstantPtr, 64, GlobalAlign32},
1014                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1015     Actions.legalIf(
1016       [=](const LegalityQuery &Query) -> bool {
1017         return isLoadStoreLegal(ST, Query, Op);
1018       });
1019 
1020     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1021     // 64-bits.
1022     //
1023     // TODO: Should generalize bitcast action into coerce, which will also cover
1024     // inserting addrspacecasts.
1025     Actions.customIf(typeIs(1, Constant32Ptr));
1026 
1027     // Turn any illegal element vectors into something easier to deal
1028     // with. These will ultimately produce 32-bit scalar shifts to extract the
1029     // parts anyway.
1030     //
1031     // For odd 16-bit element vectors, prefer to split those into pieces with
1032     // 16-bit vector parts.
1033     Actions.bitcastIf(
1034       [=](const LegalityQuery &Query) -> bool {
1035         const LLT Ty = Query.Types[0];
1036         const unsigned Size = Ty.getSizeInBits();
1037 
1038         if (Size != Query.MMODescrs[0].SizeInBits)
1039           return Size <= 32 && Ty.isVector();
1040 
1041         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1042           return true;
1043         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1044                !isRegisterVectorElementType(Ty.getElementType());
1045       }, bitcastToRegisterType(0));
1046 
1047     Actions
1048         .customIf(typeIs(1, Constant32Ptr))
1049         // Widen suitably aligned loads by loading extra elements.
1050         .moreElementsIf([=](const LegalityQuery &Query) {
1051             const LLT Ty = Query.Types[0];
1052             return Op == G_LOAD && Ty.isVector() &&
1053                    shouldWidenLoadResult(Query, Op);
1054           }, moreElementsToNextPow2(0))
1055         .widenScalarIf([=](const LegalityQuery &Query) {
1056             const LLT Ty = Query.Types[0];
1057             return Op == G_LOAD && !Ty.isVector() &&
1058                    shouldWidenLoadResult(Query, Op);
1059           }, widenScalarOrEltToNextPow2(0))
1060         .narrowScalarIf(
1061             [=](const LegalityQuery &Query) -> bool {
1062               return !Query.Types[0].isVector() &&
1063                      needToSplitMemOp(Query, Op == G_LOAD);
1064             },
1065             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1066               const LLT DstTy = Query.Types[0];
1067               const LLT PtrTy = Query.Types[1];
1068 
1069               const unsigned DstSize = DstTy.getSizeInBits();
1070               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1071 
1072               // Split extloads.
1073               if (DstSize > MemSize)
1074                 return std::make_pair(0, LLT::scalar(MemSize));
1075 
1076               if (!isPowerOf2_32(DstSize)) {
1077                 // We're probably decomposing an odd sized store. Try to split
1078                 // to the widest type. TODO: Account for alignment. As-is it
1079                 // should be OK, since the new parts will be further legalized.
1080                 unsigned FloorSize = PowerOf2Floor(DstSize);
1081                 return std::make_pair(0, LLT::scalar(FloorSize));
1082               }
1083 
1084               if (DstSize > 32 && (DstSize % 32 != 0)) {
1085                 // FIXME: Need a way to specify non-extload of larger size if
1086                 // suitably aligned.
1087                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1088               }
1089 
1090               unsigned MaxSize = maxSizeForAddrSpace(ST,
1091                                                      PtrTy.getAddressSpace(),
1092                                                      Op == G_LOAD);
1093               if (MemSize > MaxSize)
1094                 return std::make_pair(0, LLT::scalar(MaxSize));
1095 
1096               unsigned Align = Query.MMODescrs[0].AlignInBits;
1097               return std::make_pair(0, LLT::scalar(Align));
1098             })
1099         .fewerElementsIf(
1100             [=](const LegalityQuery &Query) -> bool {
1101               return Query.Types[0].isVector() &&
1102                      needToSplitMemOp(Query, Op == G_LOAD);
1103             },
1104             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1105               const LLT DstTy = Query.Types[0];
1106               const LLT PtrTy = Query.Types[1];
1107 
1108               LLT EltTy = DstTy.getElementType();
1109               unsigned MaxSize = maxSizeForAddrSpace(ST,
1110                                                      PtrTy.getAddressSpace(),
1111                                                      Op == G_LOAD);
1112 
1113               // FIXME: Handle widened to power of 2 results better. This ends
1114               // up scalarizing.
1115               // FIXME: 3 element stores scalarized on SI
1116 
1117               // Split if it's too large for the address space.
1118               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1119                 unsigned NumElts = DstTy.getNumElements();
1120                 unsigned EltSize = EltTy.getSizeInBits();
1121 
1122                 if (MaxSize % EltSize == 0) {
1123                   return std::make_pair(
1124                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1125                 }
1126 
1127                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1128 
1129                 // FIXME: Refine when odd breakdowns handled
1130                 // The scalars will need to be re-legalized.
1131                 if (NumPieces == 1 || NumPieces >= NumElts ||
1132                     NumElts % NumPieces != 0)
1133                   return std::make_pair(0, EltTy);
1134 
1135                 return std::make_pair(0,
1136                                       LLT::vector(NumElts / NumPieces, EltTy));
1137               }
1138 
1139               // FIXME: We could probably handle weird extending loads better.
1140               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1141               if (DstTy.getSizeInBits() > MemSize)
1142                 return std::make_pair(0, EltTy);
1143 
1144               unsigned EltSize = EltTy.getSizeInBits();
1145               unsigned DstSize = DstTy.getSizeInBits();
1146               if (!isPowerOf2_32(DstSize)) {
1147                 // We're probably decomposing an odd sized store. Try to split
1148                 // to the widest type. TODO: Account for alignment. As-is it
1149                 // should be OK, since the new parts will be further legalized.
1150                 unsigned FloorSize = PowerOf2Floor(DstSize);
1151                 return std::make_pair(
1152                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1153               }
1154 
1155               // Need to split because of alignment.
1156               unsigned Align = Query.MMODescrs[0].AlignInBits;
1157               if (EltSize > Align &&
1158                   (EltSize / Align < DstTy.getNumElements())) {
1159                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1160               }
1161 
1162               // May need relegalization for the scalars.
1163               return std::make_pair(0, EltTy);
1164             })
1165         .minScalar(0, S32);
1166 
1167     if (IsStore)
1168       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1169 
1170     // TODO: Need a bitcast lower option?
1171     Actions
1172         .widenScalarToNextPow2(0)
1173         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1174   }
1175 
1176   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1177                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1178                                                   {S32, GlobalPtr, 16, 2 * 8},
1179                                                   {S32, LocalPtr, 8, 8},
1180                                                   {S32, LocalPtr, 16, 16},
1181                                                   {S32, PrivatePtr, 8, 8},
1182                                                   {S32, PrivatePtr, 16, 16},
1183                                                   {S32, ConstantPtr, 8, 8},
1184                                                   {S32, ConstantPtr, 16, 2 * 8}});
1185   if (ST.hasFlatAddressSpace()) {
1186     ExtLoads.legalForTypesWithMemDesc(
1187         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1188   }
1189 
1190   ExtLoads.clampScalar(0, S32, S32)
1191           .widenScalarToNextPow2(0)
1192           .unsupportedIfMemSizeNotPow2()
1193           .lower();
1194 
1195   auto &Atomics = getActionDefinitionsBuilder(
1196     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1197      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1198      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1199      G_ATOMICRMW_UMIN})
1200     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1201                {S64, GlobalPtr}, {S64, LocalPtr},
1202                {S32, RegionPtr}, {S64, RegionPtr}});
1203   if (ST.hasFlatAddressSpace()) {
1204     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1205   }
1206 
1207   if (ST.hasLDSFPAtomics()) {
1208     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1209       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1210   }
1211 
1212   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1213   // demarshalling
1214   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1215     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1216                 {S32, FlatPtr}, {S64, FlatPtr}})
1217     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1218                {S32, RegionPtr}, {S64, RegionPtr}});
1219   // TODO: Pointer types, any 32-bit or 64-bit vector
1220 
1221   // Condition should be s32 for scalar, s1 for vector.
1222   getActionDefinitionsBuilder(G_SELECT)
1223     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1224           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1225           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1226     .clampScalar(0, S16, S64)
1227     .scalarize(1)
1228     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1229     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1230     .clampMaxNumElements(0, S32, 2)
1231     .clampMaxNumElements(0, LocalPtr, 2)
1232     .clampMaxNumElements(0, PrivatePtr, 2)
1233     .scalarize(0)
1234     .widenScalarToNextPow2(0)
1235     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1236 
1237   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1238   // be more flexible with the shift amount type.
1239   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1240     .legalFor({{S32, S32}, {S64, S32}});
1241   if (ST.has16BitInsts()) {
1242     if (ST.hasVOP3PInsts()) {
1243       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1244             .clampMaxNumElements(0, S16, 2);
1245     } else
1246       Shifts.legalFor({{S16, S16}});
1247 
1248     // TODO: Support 16-bit shift amounts for all types
1249     Shifts.widenScalarIf(
1250       [=](const LegalityQuery &Query) {
1251         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1252         // 32-bit amount.
1253         const LLT ValTy = Query.Types[0];
1254         const LLT AmountTy = Query.Types[1];
1255         return ValTy.getSizeInBits() <= 16 &&
1256                AmountTy.getSizeInBits() < 16;
1257       }, changeTo(1, S16));
1258     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1259     Shifts.clampScalar(1, S32, S32);
1260     Shifts.clampScalar(0, S16, S64);
1261     Shifts.widenScalarToNextPow2(0, 16);
1262   } else {
1263     // Make sure we legalize the shift amount type first, as the general
1264     // expansion for the shifted type will produce much worse code if it hasn't
1265     // been truncated already.
1266     Shifts.clampScalar(1, S32, S32);
1267     Shifts.clampScalar(0, S32, S64);
1268     Shifts.widenScalarToNextPow2(0, 32);
1269   }
1270   Shifts.scalarize(0);
1271 
1272   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1273     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1274     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1275     unsigned IdxTypeIdx = 2;
1276 
1277     getActionDefinitionsBuilder(Op)
1278       .customIf([=](const LegalityQuery &Query) {
1279           const LLT EltTy = Query.Types[EltTypeIdx];
1280           const LLT VecTy = Query.Types[VecTypeIdx];
1281           const LLT IdxTy = Query.Types[IdxTypeIdx];
1282           return (EltTy.getSizeInBits() == 16 ||
1283                   EltTy.getSizeInBits() % 32 == 0) &&
1284                  VecTy.getSizeInBits() % 32 == 0 &&
1285                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1286                  IdxTy.getSizeInBits() == 32;
1287         })
1288       .clampScalar(EltTypeIdx, S32, S64)
1289       .clampScalar(VecTypeIdx, S32, S64)
1290       .clampScalar(IdxTypeIdx, S32, S32)
1291       // TODO: Clamp the number of elements before resorting to stack lowering.
1292       // It should only be necessary with variable indexes.
1293       // As a last resort, lower to the stack
1294       .lower();
1295   }
1296 
1297   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1298     .unsupportedIf([=](const LegalityQuery &Query) {
1299         const LLT &EltTy = Query.Types[1].getElementType();
1300         return Query.Types[0] != EltTy;
1301       });
1302 
1303   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1304     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1305     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1306 
1307     // FIXME: Doesn't handle extract of illegal sizes.
1308     getActionDefinitionsBuilder(Op)
1309       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1310       // FIXME: Multiples of 16 should not be legal.
1311       .legalIf([=](const LegalityQuery &Query) {
1312           const LLT BigTy = Query.Types[BigTyIdx];
1313           const LLT LitTy = Query.Types[LitTyIdx];
1314           return (BigTy.getSizeInBits() % 32 == 0) &&
1315                  (LitTy.getSizeInBits() % 16 == 0);
1316         })
1317       .widenScalarIf(
1318         [=](const LegalityQuery &Query) {
1319           const LLT BigTy = Query.Types[BigTyIdx];
1320           return (BigTy.getScalarSizeInBits() < 16);
1321         },
1322         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1323       .widenScalarIf(
1324         [=](const LegalityQuery &Query) {
1325           const LLT LitTy = Query.Types[LitTyIdx];
1326           return (LitTy.getScalarSizeInBits() < 16);
1327         },
1328         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1329       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1330       .widenScalarToNextPow2(BigTyIdx, 32);
1331 
1332   }
1333 
1334   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1335     .legalForCartesianProduct(AllS32Vectors, {S32})
1336     .legalForCartesianProduct(AllS64Vectors, {S64})
1337     .clampNumElements(0, V16S32, V32S32)
1338     .clampNumElements(0, V2S64, V16S64)
1339     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1340 
1341   if (ST.hasScalarPackInsts()) {
1342     BuildVector
1343       // FIXME: Should probably widen s1 vectors straight to s32
1344       .minScalarOrElt(0, S16)
1345       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1346       .minScalar(1, S32);
1347 
1348     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1349       .legalFor({V2S16, S32})
1350       .lower();
1351     BuildVector.minScalarOrElt(0, S32);
1352   } else {
1353     BuildVector.customFor({V2S16, S16});
1354     BuildVector.minScalarOrElt(0, S32);
1355 
1356     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1357       .customFor({V2S16, S32})
1358       .lower();
1359   }
1360 
1361   BuildVector.legalIf(isRegisterType(0));
1362 
1363   // FIXME: Clamp maximum size
1364   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1365     .legalIf(isRegisterType(0));
1366 
1367   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1368   // pre-legalize.
1369   if (ST.hasVOP3PInsts()) {
1370     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1371       .customFor({V2S16, V2S16})
1372       .lower();
1373   } else
1374     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1375 
1376   // Merge/Unmerge
1377   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1378     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1379     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1380 
1381     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1382       const LLT Ty = Query.Types[TypeIdx];
1383       if (Ty.isVector()) {
1384         const LLT &EltTy = Ty.getElementType();
1385         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1386           return true;
1387         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1388           return true;
1389       }
1390       return false;
1391     };
1392 
1393     auto &Builder = getActionDefinitionsBuilder(Op)
1394       .lowerFor({{S16, V2S16}})
1395       .lowerIf([=](const LegalityQuery &Query) {
1396           const LLT BigTy = Query.Types[BigTyIdx];
1397           return BigTy.getSizeInBits() == 32;
1398         })
1399       // Try to widen to s16 first for small types.
1400       // TODO: Only do this on targets with legal s16 shifts
1401       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1402       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1403       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1404       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1405                            elementTypeIs(1, S16)),
1406                        changeTo(1, V2S16))
1407       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1408       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1409       // valid.
1410       .clampScalar(LitTyIdx, S32, S512)
1411       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1412       // Break up vectors with weird elements into scalars
1413       .fewerElementsIf(
1414         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1415         scalarize(0))
1416       .fewerElementsIf(
1417         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1418         scalarize(1))
1419       .clampScalar(BigTyIdx, S32, MaxScalar);
1420 
1421     if (Op == G_MERGE_VALUES) {
1422       Builder.widenScalarIf(
1423         // TODO: Use 16-bit shifts if legal for 8-bit values?
1424         [=](const LegalityQuery &Query) {
1425           const LLT Ty = Query.Types[LitTyIdx];
1426           return Ty.getSizeInBits() < 32;
1427         },
1428         changeTo(LitTyIdx, S32));
1429     }
1430 
1431     Builder.widenScalarIf(
1432       [=](const LegalityQuery &Query) {
1433         const LLT Ty = Query.Types[BigTyIdx];
1434         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1435           Ty.getSizeInBits() % 16 != 0;
1436       },
1437       [=](const LegalityQuery &Query) {
1438         // Pick the next power of 2, or a multiple of 64 over 128.
1439         // Whichever is smaller.
1440         const LLT &Ty = Query.Types[BigTyIdx];
1441         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1442         if (NewSizeInBits >= 256) {
1443           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1444           if (RoundedTo < NewSizeInBits)
1445             NewSizeInBits = RoundedTo;
1446         }
1447         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1448       })
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT &BigTy = Query.Types[BigTyIdx];
1451           const LLT &LitTy = Query.Types[LitTyIdx];
1452 
1453           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1454             return false;
1455           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1456             return false;
1457 
1458           return BigTy.getSizeInBits() % 16 == 0 &&
1459                  LitTy.getSizeInBits() % 16 == 0 &&
1460                  BigTy.getSizeInBits() <= MaxRegisterSize;
1461         })
1462       // Any vectors left are the wrong size. Scalarize them.
1463       .scalarize(0)
1464       .scalarize(1);
1465   }
1466 
1467   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1468   // RegBankSelect.
1469   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1470     .legalFor({{S32}, {S64}});
1471 
1472   if (ST.hasVOP3PInsts()) {
1473     SextInReg.lowerFor({{V2S16}})
1474       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1475       // get more vector shift opportunities, since we'll get those when
1476       // expanded.
1477       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1478   } else if (ST.has16BitInsts()) {
1479     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1480   } else {
1481     // Prefer to promote to s32 before lowering if we don't have 16-bit
1482     // shifts. This avoid a lot of intermediate truncate and extend operations.
1483     SextInReg.lowerFor({{S32}, {S64}});
1484   }
1485 
1486   SextInReg
1487     .scalarize(0)
1488     .clampScalar(0, S32, S64)
1489     .lower();
1490 
1491   getActionDefinitionsBuilder(G_FSHR)
1492     .legalFor({{S32, S32}})
1493     .scalarize(0)
1494     .lower();
1495 
1496   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1497     .legalFor({S64});
1498 
1499   getActionDefinitionsBuilder(G_FENCE)
1500     .alwaysLegal();
1501 
1502   getActionDefinitionsBuilder({
1503       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1504       G_FCOPYSIGN,
1505 
1506       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1507       G_ATOMICRMW_NAND,
1508       G_ATOMICRMW_FSUB,
1509       G_READ_REGISTER,
1510       G_WRITE_REGISTER,
1511 
1512       G_SADDO, G_SSUBO,
1513 
1514        // TODO: Implement
1515       G_FMINIMUM, G_FMAXIMUM,
1516       G_FSHL
1517     }).lower();
1518 
1519   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1520         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1521         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1522     .unsupported();
1523 
1524   computeTables();
1525   verify(*ST.getInstrInfo());
1526 }
1527 
// Dispatch entry point for every operation the legality rules marked as
// Custom. Each helper below rewrites MI in place through the helper's
// MIRBuilder; a helper's return value reports whether legalization
// succeeded. Returning false from the default case tells the
// LegalizerHelper this opcode has no custom handling here.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed/unsigned int<->fp conversions share one helper; the bool selects
  // the signed variant.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are expanded via log2 scaled by ln(2) (and /ln(10) for log10).
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  // Every case above returns; this only documents that fact.
  llvm_unreachable("expected switch to return");
}
1599 
// Return a 32-bit register holding the aperture (the high half of the 64-bit
// flat address) for the given segment address space. \p AS must be LOCAL or
// PRIVATE. Returns an invalid Register if the queue pointer input required by
// the fallback path cannot be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset, and width-minus-one into the single
    // immediate encoding expected by S_GETREG_B32.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The hwreg read yields only the field's (WidthM1 + 1) low bits; shift
    // them up into the aperture's position.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers on this subtarget: load the aperture base out of
  // the queue descriptor reached through the kernel's queue pointer argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1658 
// Custom-legalize G_ADDRSPACE_CAST. Handles no-op casts, casts to/from the
// 32-bit constant address space, and the flat <-> local/private casts that
// require the segment aperture and null-pointer checks.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts between address spaces with identical representation are just
  // bitcasts; mutate the opcode in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A flat null must map to the segment null value, which may differ from
    // the flat representation; select it explicitly.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // Remaining handled case: local/private -> flat.
  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // The aperture provides the high 32 bits of the flat address.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1754 
1755 bool AMDGPULegalizerInfo::legalizeFrint(
1756   MachineInstr &MI, MachineRegisterInfo &MRI,
1757   MachineIRBuilder &B) const {
1758   Register Src = MI.getOperand(1).getReg();
1759   LLT Ty = MRI.getType(Src);
1760   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1761 
1762   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1763   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1764 
1765   auto C1 = B.buildFConstant(Ty, C1Val);
1766   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1767 
1768   // TODO: Should this propagate fast-math-flags?
1769   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1770   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1771 
1772   auto C2 = B.buildFConstant(Ty, C2Val);
1773   auto Fabs = B.buildFAbs(Ty, Src);
1774 
1775   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1776   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1777   MI.eraseFromParent();
1778   return true;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeFceil(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784 
1785   const LLT S1 = LLT::scalar(1);
1786   const LLT S64 = LLT::scalar(64);
1787 
1788   Register Src = MI.getOperand(1).getReg();
1789   assert(MRI.getType(Src) == S64);
1790 
1791   // result = trunc(src)
1792   // if (src > 0.0 && src != result)
1793   //   result += 1.0
1794 
1795   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1796 
1797   const auto Zero = B.buildFConstant(S64, 0.0);
1798   const auto One = B.buildFConstant(S64, 1.0);
1799   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1800   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1801   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1802   auto Add = B.buildSelect(S64, And, One, Zero);
1803 
1804   // TODO: Should this propagate fast-math-flags?
1805   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1806   return true;
1807 }
1808 
1809 static MachineInstrBuilder extractF64Exponent(Register Hi,
1810                                               MachineIRBuilder &B) {
1811   const unsigned FractBits = 52;
1812   const unsigned ExpBits = 11;
1813   LLT S32 = LLT::scalar(32);
1814 
1815   auto Const0 = B.buildConstant(S32, FractBits - 32);
1816   auto Const1 = B.buildConstant(S32, ExpBits);
1817 
1818   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1819     .addUse(Hi)
1820     .addUse(Const0.getReg(0))
1821     .addUse(Const1.getReg(0));
1822 
1823   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1824 }
1825 
// Custom-lower f64 G_INTRINSIC_TRUNC with integer bit manipulation: clear the
// fraction bits that lie below the binary point, as determined by the
// unbiased exponent.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Arithmetic-shifting the fraction mask right by the exponent leaves set
  // bits exactly over the sub-integer fraction bits; clearing those bits in
  // the source truncates it toward zero.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  // Exponent < 0: |src| < 1, so the result is a signed zero (SignBit64).
  // Exponent > 51: no fraction bits remain, src is already integral.
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
1870 
1871 bool AMDGPULegalizerInfo::legalizeITOFP(
1872   MachineInstr &MI, MachineRegisterInfo &MRI,
1873   MachineIRBuilder &B, bool Signed) const {
1874 
1875   Register Dst = MI.getOperand(0).getReg();
1876   Register Src = MI.getOperand(1).getReg();
1877 
1878   const LLT S64 = LLT::scalar(64);
1879   const LLT S32 = LLT::scalar(32);
1880 
1881   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1882 
1883   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1884 
1885   auto CvtHi = Signed ?
1886     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1887     B.buildUITOFP(S64, Unmerge.getReg(1));
1888 
1889   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1890 
1891   auto ThirtyTwo = B.buildConstant(S32, 32);
1892   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1893     .addUse(CvtHi.getReg(0))
1894     .addUse(ThirtyTwo.getReg(0));
1895 
1896   // TODO: Should this propagate fast-math-flags?
1897   B.buildFAdd(Dst, LdExp, CvtLo);
1898   MI.eraseFromParent();
1899   return true;
1900 }
1901 
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Custom-lower f64 G_FPTOSI/G_FPTOUI to s64 by computing the two 32-bit
// halves of the result separately.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  // K0 = 2^-32, K1 = -2^32 (as f64 bit patterns).
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  // High half: floor(trunc * 2^-32). Low half: the remainder
  // trunc - floor(...) * 2^32, computed in one step with fma(FloorMul, -2^32,
  // Trunc), which always fits in an unsigned 32-bit conversion.
  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo, Hi });
  MI.eraseFromParent();

  return true;
}
1936 
1937 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1938                                                MachineInstr &MI) const {
1939   MachineFunction &MF = Helper.MIRBuilder.getMF();
1940   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1941 
1942   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1943                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1944 
1945   // With ieee_mode disabled, the instructions have the correct behavior
1946   // already for G_FMINNUM/G_FMAXNUM
1947   if (!MFI->getMode().IEEE)
1948     return !IsIEEEOp;
1949 
1950   if (IsIEEEOp)
1951     return true;
1952 
1953   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1954 }
1955 
1956 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1957   MachineInstr &MI, MachineRegisterInfo &MRI,
1958   MachineIRBuilder &B) const {
1959   // TODO: Should move some of this into LegalizerHelper.
1960 
1961   // TODO: Promote dynamic indexing of s16 to s32
1962 
1963   // FIXME: Artifact combiner probably should have replaced the truncated
1964   // constant before this, so we shouldn't need
1965   // getConstantVRegValWithLookThrough.
1966   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1967     MI.getOperand(2).getReg(), MRI);
1968   if (!IdxVal) // Dynamic case will be selected to register indexing.
1969     return true;
1970 
1971   Register Dst = MI.getOperand(0).getReg();
1972   Register Vec = MI.getOperand(1).getReg();
1973 
1974   LLT VecTy = MRI.getType(Vec);
1975   LLT EltTy = VecTy.getElementType();
1976   assert(EltTy == MRI.getType(Dst));
1977 
1978   if (IdxVal->Value < VecTy.getNumElements())
1979     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1980   else
1981     B.buildUndef(Dst);
1982 
1983   MI.eraseFromParent();
1984   return true;
1985 }
1986 
1987 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1988   MachineInstr &MI, MachineRegisterInfo &MRI,
1989   MachineIRBuilder &B) const {
1990   // TODO: Should move some of this into LegalizerHelper.
1991 
1992   // TODO: Promote dynamic indexing of s16 to s32
1993 
1994   // FIXME: Artifact combiner probably should have replaced the truncated
1995   // constant before this, so we shouldn't need
1996   // getConstantVRegValWithLookThrough.
1997   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1998     MI.getOperand(3).getReg(), MRI);
1999   if (!IdxVal) // Dynamic case will be selected to register indexing.
2000     return true;
2001 
2002   Register Dst = MI.getOperand(0).getReg();
2003   Register Vec = MI.getOperand(1).getReg();
2004   Register Ins = MI.getOperand(2).getReg();
2005 
2006   LLT VecTy = MRI.getType(Vec);
2007   LLT EltTy = VecTy.getElementType();
2008   assert(EltTy == MRI.getType(Ins));
2009 
2010   if (IdxVal->Value < VecTy.getNumElements())
2011     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2012   else
2013     B.buildUndef(Dst);
2014 
2015   MI.eraseFromParent();
2016   return true;
2017 }
2018 
2019 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2020   MachineInstr &MI, MachineRegisterInfo &MRI,
2021   MachineIRBuilder &B) const {
2022   const LLT V2S16 = LLT::vector(2, 16);
2023 
2024   Register Dst = MI.getOperand(0).getReg();
2025   Register Src0 = MI.getOperand(1).getReg();
2026   LLT DstTy = MRI.getType(Dst);
2027   LLT SrcTy = MRI.getType(Src0);
2028 
2029   if (SrcTy == V2S16 && DstTy == V2S16 &&
2030       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2031     return true;
2032 
2033   MachineIRBuilder HelperBuilder(MI);
2034   GISelObserverWrapper DummyObserver;
2035   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2036   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2037 }
2038 
2039 bool AMDGPULegalizerInfo::legalizeSinCos(
2040   MachineInstr &MI, MachineRegisterInfo &MRI,
2041   MachineIRBuilder &B) const {
2042 
2043   Register DstReg = MI.getOperand(0).getReg();
2044   Register SrcReg = MI.getOperand(1).getReg();
2045   LLT Ty = MRI.getType(DstReg);
2046   unsigned Flags = MI.getFlags();
2047 
2048   Register TrigVal;
2049   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2050   if (ST.hasTrigReducedRange()) {
2051     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2052     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2053       .addUse(MulVal.getReg(0))
2054       .setMIFlags(Flags).getReg(0);
2055   } else
2056     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2057 
2058   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2059     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2060   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2061     .addUse(TrigVal)
2062     .setMIFlags(Flags);
2063   MI.eraseFromParent();
2064   return true;
2065 }
2066 
// Emit a SI_PC_ADD_REL_OFFSET-based pc-relative address for \p GV into
// \p DstReg. \p GAFlags selects the relocation kind (MO_NONE for a plain
// fixup, otherwise e.g. MO_REL32/MO_GOTPCREL32, with GAFlags + 1 used for the
// high half). For 32-bit \p PtrTy the 64-bit result is truncated.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pseudo always produces a 64-bit address; for a 32-bit destination,
  // compute into a temporary and extract the low half below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
 }
2124 
// Custom-legalize G_GLOBAL_VALUE: LDS/region globals become frame-relative
// constants (or diagnostics), other globals become pc-relative addresses,
// either directly or through a GOT load.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      // The address of an LDS global is just its allocated offset within the
      // kernel's LDS block.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Directly reachable globals: plain pc-relative fixup.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise use an explicit pc-relative relocation when possible.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Fall back to loading the address out of the GOT.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2212 
2213 bool AMDGPULegalizerInfo::legalizeLoad(
2214   MachineInstr &MI, MachineRegisterInfo &MRI,
2215   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2216   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2217   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2218   Observer.changingInstr(MI);
2219   MI.getOperand(1).setReg(Cast.getReg(0));
2220   Observer.changedInstr(MI);
2221   return true;
2222 }
2223 
2224 bool AMDGPULegalizerInfo::legalizeFMad(
2225   MachineInstr &MI, MachineRegisterInfo &MRI,
2226   MachineIRBuilder &B) const {
2227   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2228   assert(Ty.isScalar());
2229 
2230   MachineFunction &MF = B.getMF();
2231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2232 
2233   // TODO: Always legal with future ftz flag.
2234   // FIXME: Do we need just output?
2235   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2236     return true;
2237   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2238     return true;
2239 
2240   MachineIRBuilder HelperBuilder(MI);
2241   GISelObserverWrapper DummyObserver;
2242   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2243   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2247   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2248   Register DstReg = MI.getOperand(0).getReg();
2249   Register PtrReg = MI.getOperand(1).getReg();
2250   Register CmpVal = MI.getOperand(2).getReg();
2251   Register NewVal = MI.getOperand(3).getReg();
2252 
2253   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2254          "this should not have been custom lowered");
2255 
2256   LLT ValTy = MRI.getType(CmpVal);
2257   LLT VecTy = LLT::vector(2, ValTy);
2258 
2259   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2260 
2261   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2262     .addDef(DstReg)
2263     .addUse(PtrReg)
2264     .addUse(PackedVal)
2265     .setMemRefs(MI.memoperands());
2266 
2267   MI.eraseFromParent();
2268   return true;
2269 }
2270 
2271 bool AMDGPULegalizerInfo::legalizeFlog(
2272   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2273   Register Dst = MI.getOperand(0).getReg();
2274   Register Src = MI.getOperand(1).getReg();
2275   LLT Ty = B.getMRI()->getType(Dst);
2276   unsigned Flags = MI.getFlags();
2277 
2278   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2279   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2280 
2281   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2282   MI.eraseFromParent();
2283   return true;
2284 }
2285 
2286 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2287                                        MachineIRBuilder &B) const {
2288   Register Dst = MI.getOperand(0).getReg();
2289   Register Src = MI.getOperand(1).getReg();
2290   unsigned Flags = MI.getFlags();
2291   LLT Ty = B.getMRI()->getType(Dst);
2292 
2293   auto K = B.buildFConstant(Ty, numbers::log2e);
2294   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2295   B.buildFExp2(Dst, Mul, Flags);
2296   MI.eraseFromParent();
2297   return true;
2298 }
2299 
// Custom-lower G_FPOW as exp2(log2(x) * y) using the legacy multiply, for f32
// and f16. The legacy multiply's zero-times-anything-is-zero behavior handles
// pow's special cases.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}
2334 
2335 // Find a source register, ignoring any possible source modifiers.
2336 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2337   Register ModSrc = OrigSrc;
2338   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2339     ModSrc = SrcFNeg->getOperand(1).getReg();
2340     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2341       ModSrc = SrcFAbs->getOperand(1).getReg();
2342   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2343     ModSrc = SrcFAbs->getOperand(1).getReg();
2344   return ModSrc;
2345 }
2346 
// Custom-lower f64 G_FFLOOR on subtargets with the V_FRACT hardware bug,
// where floor(x) is computed as x - fract(x) with a correction for the bug.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double strictly below 1.0.
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  // Pre-create the result register so either min variant below can define it.
  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  // Without nnan, explicitly pass NaN inputs through unchanged.
  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
2402 
2403 // Turn an illegal packed v2s16 build vector into bit operations.
2404 // TODO: This should probably be a bitcast action in LegalizerHelper.
2405 bool AMDGPULegalizerInfo::legalizeBuildVector(
2406   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2407   Register Dst = MI.getOperand(0).getReg();
2408   const LLT S32 = LLT::scalar(32);
2409   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2410 
2411   Register Src0 = MI.getOperand(1).getReg();
2412   Register Src1 = MI.getOperand(2).getReg();
2413   assert(MRI.getType(Src0) == LLT::scalar(16));
2414 
2415   auto Merge = B.buildMerge(S32, {Src0, Src1});
2416   B.buildBitcast(Dst, Merge);
2417 
2418   MI.eraseFromParent();
2419   return true;
2420 }
2421 
// Return the use branch instruction, otherwise null if the usage is invalid.
// A valid use is a single G_BRCOND in the same block, which is either the
// last instruction (in which case \p UncondBrTarget is set to the layout
// successor) or is followed by a G_BR (returned via \p Br, with
// \p UncondBrTarget set to its target).
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br,
                                       MachineBasicBlock *&UncondBrTarget) {
  Register CondDef = MI.getOperand(0).getReg();
  // The condition must have exactly one (non-debug) use.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  // That use must be a G_BRCOND in the same block as the intrinsic.
  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != Parent ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next == Parent->end()) {
    // Fallthrough: the unconditional target is the next block in layout.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return &UseMI;
}
2453 
/// Ensure the live-in virtual register \p LiveIn has a defining copy from the
/// physical register \p PhyReg in the entry block, inserting one if missing.
/// Returns \p LiveIn unchanged.
Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
                                               MachineRegisterInfo &MRI,
                                               Register LiveIn,
                                               Register PhyReg) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Insert the live-in copy, if required, by defining destination virtual
  // register.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    // Save the builder's current position so it can be restored after
    // emitting into the entry block.
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    // The copy from the physical register must be at the top of the entry
    // block, and the block must declare the register live-in.
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(PhyReg);
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, PhyReg);

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return LiveIn;
}
2478 
/// Return (creating if necessary) the virtual register tracking the live-in
/// physical register \p PhyReg with type \p Ty. If \p InsertLiveInCopy is
/// true, also make sure an entry-block copy from the physical register
/// exists.
Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register PhyReg, LLT Ty,
                                                bool InsertLiveInCopy) const {
  assert(PhyReg.isPhysical() && "Physical register expected");

  // Get or create virtual live-in register
  Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
  if (!LiveIn) {
    LiveIn = MRI.createGenericVirtualRegister(Ty);
    MRI.addLiveIn(PhyReg, LiveIn);
  }

  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // to virtual register is not required
  if (!InsertLiveInCopy)
    return LiveIn;

  return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
}
2500 
/// Copy the preloaded input described by \p Arg into \p DstReg. For masked
/// arguments (e.g. workgroup IDs packed into one register), extract the
/// relevant bitfield with a shift and mask. Always returns true.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(SrcReg.isPhysical() && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  MachineRegisterInfo &MRI = *B.getMRI();
  Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    // Position of the field's lowest set bit; shifting by this aligns the
    // field to bit 0 before masking.
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    // Mask is pre-shifted to match the already-shifted source.
    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}
2532 
2533 bool AMDGPULegalizerInfo::loadInputValue(
2534     Register DstReg, MachineIRBuilder &B,
2535     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2536   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2537   const ArgDescriptor *Arg;
2538   const TargetRegisterClass *ArgRC;
2539   LLT ArgTy;
2540   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2541 
2542   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2543     return false; // TODO: Handle these
2544   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2545 }
2546 
2547 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2548     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2549     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2550   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2551     return false;
2552 
2553   MI.eraseFromParent();
2554   return true;
2555 }
2556 
2557 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2558                                        MachineRegisterInfo &MRI,
2559                                        MachineIRBuilder &B) const {
2560   Register Dst = MI.getOperand(0).getReg();
2561   LLT DstTy = MRI.getType(Dst);
2562   LLT S16 = LLT::scalar(16);
2563   LLT S32 = LLT::scalar(32);
2564   LLT S64 = LLT::scalar(64);
2565 
2566   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2567     return true;
2568 
2569   if (DstTy == S16)
2570     return legalizeFDIV16(MI, MRI, B);
2571   if (DstTy == S32)
2572     return legalizeFDIV32(MI, MRI, B);
2573   if (DstTy == S64)
2574     return legalizeFDIV64(MI, MRI, B);
2575 
2576   return false;
2577 }
2578 
/// Expand a 32-bit unsigned division (\p IsDiv) or remainder of \p X by \p Y
/// into \p DstReg using a float-reciprocal estimate refined by one
/// Newton-Raphson step and two quotient/remainder corrections.
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register X,
                                                  Register Y,
                                                  bool IsDiv) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  // 0x4f7ffffe is the bit pattern of the float just below 2^32, used to
  // scale the reciprocal into fixed-point range.
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}
2620 
2621 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2622                                               MachineRegisterInfo &MRI,
2623                                               MachineIRBuilder &B) const {
2624   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2625   Register DstReg = MI.getOperand(0).getReg();
2626   Register Num = MI.getOperand(1).getReg();
2627   Register Den = MI.getOperand(2).getReg();
2628   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2629   MI.eraseFromParent();
2630   return true;
2631 }
2632 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit value into its low and high 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // 0x4f800000 is 2**32 as a float.
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // 0x5f7ffffc is the scale factor from the comment above (just below
  // 2**64), converting the reciprocal into fixed point.
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
2675 
/// Expand a 64-bit unsigned division (\p IsDiv) or remainder of \p Numer by
/// \p Denom into \p DstReg. Uses a fixed-point reciprocal estimate of the
/// denominator (emitReciprocalU64) refined with two Newton-Raphson style
/// correction steps, followed by up to two conditional adjustments of the
/// quotient/remainder.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement step: Rcp += Rcp * (-Denom * Rcp) >> 64, computed with
  // an explicit 32-bit carry chain.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // "No carry" variant of the high half, needed by the second step below.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  // NOTE(review): the carry-in here is deliberately the carry out of
  // Add1_Lo (the first step), mirroring the SelectionDAG expansion of this
  // sequence.
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate and corresponding remainder: Sub1 = Numer - Q*Denom.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // "Mid" high half without borrow, used when chaining the next subtract.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 iff the remainder estimate is still >= Denom (compare high
  // halves; break ties with the low halves).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // Second correction: C6 = -1 iff the once-corrected remainder is still
  // >= Denom.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder based on how many corrections were
  // actually needed.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}
2788 
2789 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2790                                             MachineRegisterInfo &MRI,
2791                                             MachineIRBuilder &B) const {
2792   const LLT S64 = LLT::scalar(64);
2793   const LLT S32 = LLT::scalar(32);
2794   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2795   Register DstReg = MI.getOperand(0).getReg();
2796   Register Num = MI.getOperand(1).getReg();
2797   Register Den = MI.getOperand(2).getReg();
2798   LLT Ty = MRI.getType(DstReg);
2799 
2800   if (Ty == S32)
2801     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2802   else if (Ty == S64)
2803     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2804   else
2805     return false;
2806 
2807   MI.eraseFromParent();
2808   return true;
2809 
2810 }
2811 
/// Legalize a G_SDIV/G_SREM by reducing it to an unsigned division:
/// take absolute values of both operands (via sign-extend/add/xor), perform
/// the unsigned expansion, then restore the result's sign.
bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(DstReg);
  if (Ty != S32 && Ty != S64)
    return false;

  const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;

  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  // Arithmetic shift by width-1 yields 0 for non-negative values, -1 for
  // negative ones.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // (x + sign) ^ sign == |x| for two's complement.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(Ty);
  if (Ty == S32)
    legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
  else
    legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);

  // Quotient sign is sign(LHS) ^ sign(RHS); remainder sign follows LHS.
  Register Sign;
  if (IsDiv)
    Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
  else
    Sign = LHSign.getReg(0); // Remainder sign is the same as LHS

  // Apply the sign with the same xor/subtract trick in reverse.
  UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
  B.buildSub(DstReg, UDivRem, Sign);

  MI.eraseFromParent();
  return true;
}
2856 
/// Attempt a cheap fdiv expansion using v_rcp. Succeeds when the numerator
/// is +/-1.0 (always legal to use rcp directly for the selected cases), or
/// when unsafe FP math permits x / y -> x * (1/y). Returns false when the
/// precise expansion is required.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  // Either the global unsafe-math option or a per-instruction arcp flag
  // permits the reciprocal approximation.
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  // f64 is only handled here under the global unsafe-math option.
  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  // With f32 denormals enabled, rcp flushes and is not accurate enough
  // unless unsafe math is allowed.
  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2917 
/// Expand a 16-bit fdiv: compute the quotient in f32 (extend, rcp, multiply,
/// truncate) and post-process with amdgcn.div.fixup to handle special
/// values.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // Do the arithmetic in f32 for extra precision.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div_fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
2949 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE takes FP32 mode in bits [1:0] and FP64/FP16 mode in
    // bits [3:2].
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    // Without S_DENORM_MODE, write the MODE hwreg: 2-bit field (width-1 = 1)
    // at offset 4.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
2979 
/// Expand a 32-bit fdiv with the full-precision div_scale / rcp / FMA
/// refinement / div_fmas / div_fixup sequence. FP32 denormals are temporarily
/// enabled around the FMA chain when the function's mode has them disabled.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale's last immediate selects which operand is scaled (0 =
  // denominator, 1 = numerator); the S1 result reports the scale condition.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson refinement of the reciprocal, then of the quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3045 
/// Expand a 64-bit fdiv with the div_scale / rcp / FMA refinement /
/// div_fmas / div_fixup sequence, including the SI workaround for the
/// unusable div_scale condition output.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Last immediate selects the scaled operand: 0 = denominator,
  // 1 = numerator.
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Two Newton-Raphson iterations to refine the reciprocal.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  // Quotient estimate and error correction.
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Reconstruct the condition by comparing the exponent-carrying high
    // halves of the operands with those of the scaled values.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup takes (quotient, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3123 
3124 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3125                                                  MachineRegisterInfo &MRI,
3126                                                  MachineIRBuilder &B) const {
3127   Register Res = MI.getOperand(0).getReg();
3128   Register LHS = MI.getOperand(2).getReg();
3129   Register RHS = MI.getOperand(3).getReg();
3130   uint16_t Flags = MI.getFlags();
3131 
3132   LLT S32 = LLT::scalar(32);
3133   LLT S1 = LLT::scalar(1);
3134 
3135   auto Abs = B.buildFAbs(S32, RHS, Flags);
3136   const APFloat C0Val(1.0f);
3137 
3138   auto C0 = B.buildConstant(S32, 0x6f800000);
3139   auto C1 = B.buildConstant(S32, 0x2f800000);
3140   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3141 
3142   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3143   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3144 
3145   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3146 
3147   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3148     .addUse(Mul0.getReg(0))
3149     .setMIFlags(Flags);
3150 
3151   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3152 
3153   B.buildFMul(Res, Sel, Mul1, Flags);
3154 
3155   MI.eraseFromParent();
3156   return true;
3157 }
3158 
/// Compute the implicit kernel argument pointer into \p DstReg as
/// kernarg-segment pointer + implicit-parameter offset. Returns false if the
/// kernarg segment pointer input is unavailable.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Offset constant uses an integer type with the pointer's bit width.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
3177 
3178 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3179                                                  MachineRegisterInfo &MRI,
3180                                                  MachineIRBuilder &B) const {
3181   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3182   if (!MFI->isEntryFunction()) {
3183     return legalizePreloadedArgIntrin(MI, MRI, B,
3184                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3185   }
3186 
3187   Register DstReg = MI.getOperand(0).getReg();
3188   if (!getImplicitArgPtr(DstReg, MRI, B))
3189     return false;
3190 
3191   MI.eraseFromParent();
3192   return true;
3193 }
3194 
/// Legalize the is.shared/is.private intrinsics: a flat pointer belongs to
/// \p AddrSpace iff its high 32 bits equal that address space's aperture
/// base.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  // Extract bits [63:32] of the flat pointer operand.
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
3205 
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field).  This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
//
// Returns {voffset register, immoffset, total constant offset}.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value representable in the 12-bit immoffset field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  // Peel any constant component off the offset.
  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold the overflowing part into the variable (voffset) component.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The caller always expects a valid voffset register.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
3254 
3255 /// Handle register layout difference for f16 images for some subtargets.
3256 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3257                                              MachineRegisterInfo &MRI,
3258                                              Register Reg) const {
3259   if (!ST.hasUnpackedD16VMem())
3260     return Reg;
3261 
3262   const LLT S16 = LLT::scalar(16);
3263   const LLT S32 = LLT::scalar(32);
3264   LLT StoreVT = MRI.getType(Reg);
3265   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3266 
3267   auto Unmerge = B.buildUnmerge(S16, Reg);
3268 
3269   SmallVector<Register, 4> WideRegs;
3270   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3271     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3272 
3273   int NumElts = StoreVT.getNumElements();
3274 
3275   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3276 }
3277 
3278 Register AMDGPULegalizerInfo::fixStoreSourceType(
3279   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3280   MachineRegisterInfo *MRI = B.getMRI();
3281   LLT Ty = MRI->getType(VData);
3282 
3283   const LLT S16 = LLT::scalar(16);
3284 
3285   // Fixup illegal register types for i8 stores.
3286   if (Ty == LLT::scalar(8) || Ty == S16) {
3287     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3288     return AnyExt;
3289   }
3290 
3291   if (Ty.isVector()) {
3292     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3293       if (IsFormat)
3294         return handleD16VData(B, *MRI, VData);
3295     }
3296   }
3297 
3298   return VData;
3299 }
3300 
// Lower raw/struct buffer store intrinsics to the G_AMDGPU_BUFFER_STORE*
// family of pseudo instructions.
//
// \p IsTyped selects the tbuffer (typed) variants, which carry an extra
// format immediate; \p IsFormat selects the format-store variants. Plain
// stores pick an opcode from the memory access size instead.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Widen/repack the source data register if the subtarget requires it.
  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the offset between the voffset register and the instruction's
  // immediate field; fold any constant part into the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // Raw intrinsics have no vindex operand; materialize a zero for the pseudo.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3390 
// Lower raw/struct buffer load intrinsics to the G_AMDGPU_BUFFER_LOAD*
// family of pseudo instructions, widening the destination for extending
// loads and repacking unpacked D16 results back into the expected type.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the offset between the voffset register and the instruction's
  // immediate field; fold any constant part into the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword and scalar d16 loads need a wider temporary destination.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  // Raw intrinsics have no vindex operand; materialize a zero for the pseudo.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Convert the widened temporary back to the original result type,
    // inserting after the new load instruction.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for the extending load; truncate it back.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
3510 
3511 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3512                                                MachineIRBuilder &B,
3513                                                bool IsInc) const {
3514   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3515                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3516   B.buildInstr(Opc)
3517     .addDef(MI.getOperand(0).getReg())
3518     .addUse(MI.getOperand(2).getReg())
3519     .addUse(MI.getOperand(3).getReg())
3520     .cloneMemRefs(MI);
3521   MI.eraseFromParent();
3522   return true;
3523 }
3524 
// Map a raw/struct buffer atomic intrinsic ID to the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode. The raw and struct variants of
// each operation share a pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
3570 
// Lower raw/struct buffer atomic intrinsics to the G_AMDGPU_BUFFER_ATOMIC_*
// pseudo selected by getBufferAtomicPseudo. cmpswap carries one extra value
// operand (the compare value), and the struct variants carry a vindex; both
// shift the positions of the later operands, tracked via OpOffset.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset between the voffset register and the instruction's
  // immediate field; fold any constant part into the memory operand.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // Raw intrinsics have no vindex operand; materialize a zero for the pseudo.
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal); // cmp

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3633 
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Operands in [\p AddrIdx, \p DimIdx) are bitcast to <2 x s16> as-is; from
/// \p DimIdx onward, consecutive s16 components are paired into <2 x s16>
/// registers, padding with undef where a component has no partner. The packed
/// registers are appended to \p PackedAddrs.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int EndIdx,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < EndIdx; ++I) {
    MachineOperand &SrcOp = MI.getOperand(I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < DimIdx) {
      // Pre-dim operands are reinterpreted as <2 x s16> without repacking.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1)) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this component together with the next one and skip it.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
3674 
3675 /// Convert from separate vaddr components to a single vector address register,
3676 /// and replace the remaining operands with $noreg.
3677 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3678                                      int DimIdx, int NumVAddrs) {
3679   const LLT S32 = LLT::scalar(32);
3680 
3681   SmallVector<Register, 8> AddrRegs;
3682   for (int I = 0; I != NumVAddrs; ++I) {
3683     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3684     if (SrcOp.isReg()) {
3685       AddrRegs.push_back(SrcOp.getReg());
3686       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3687     }
3688   }
3689 
3690   int NumAddrRegs = AddrRegs.size();
3691   if (NumAddrRegs != 1) {
3692     // Round up to 8 elements for v5-v7
3693     // FIXME: Missing intermediate sized register classes and instructions.
3694     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3695       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3696       auto Undef = B.buildUndef(S32);
3697       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3698       NumAddrRegs = RoundedNumRegs;
3699     }
3700 
3701     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3702     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3703   }
3704 
3705   for (int I = 1; I != NumVAddrs; ++I) {
3706     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3707     if (SrcOp.isReg())
3708       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3709   }
3710 }
3711 
3712 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3713 ///
3714 /// Depending on the subtarget, load/store with 16-bit element data need to be
3715 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3716 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3717 /// registers.
3718 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
3725 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3726     MachineInstr &MI, MachineIRBuilder &B,
3727     GISelChangeObserver &Observer,
3728     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3729 
3730   const int NumDefs = MI.getNumExplicitDefs();
3731   bool IsTFE = NumDefs == 2;
3732   // We are only processing the operands of d16 image operations on subtargets
3733   // that use the unpacked register layout, or need to repack the TFE result.
3734 
3735   // TODO: Do we need to guard against already legalized intrinsics?
3736   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3737     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3738 
3739   MachineRegisterInfo *MRI = B.getMRI();
3740   const LLT S32 = LLT::scalar(32);
3741   const LLT S16 = LLT::scalar(16);
3742   const LLT V2S16 = LLT::vector(2, 16);
3743 
3744   // Index of first address argument
3745   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3746 
3747   int NumVAddrs, NumGradients;
3748   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3749   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3750     getDMaskIdx(BaseOpcode, NumDefs);
3751   unsigned DMask = 0;
3752 
3753   // Check for 16 bit addresses and pack if true.
3754   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3755   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3756   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3757   const bool IsG16 = GradTy == S16;
3758   const bool IsA16 = AddrTy == S16;
3759 
3760   int DMaskLanes = 0;
3761   if (!BaseOpcode->Atomic) {
3762     DMask = MI.getOperand(DMaskIdx).getImm();
3763     if (BaseOpcode->Gather4) {
3764       DMaskLanes = 4;
3765     } else if (DMask != 0) {
3766       DMaskLanes = countPopulation(DMask);
3767     } else if (!IsTFE && !BaseOpcode->Store) {
3768       // If dmask is 0, this is a no-op load. This can be eliminated.
3769       B.buildUndef(MI.getOperand(0));
3770       MI.eraseFromParent();
3771       return true;
3772     }
3773   }
3774 
3775   Observer.changingInstr(MI);
3776   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3777 
3778   unsigned NewOpcode = NumDefs == 0 ?
3779     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3780 
3781   // Track that we legalized this
3782   MI.setDesc(B.getTII().get(NewOpcode));
3783 
3784   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
3785   // dmask to be at least 1 otherwise the instruction will fail
3786   if (IsTFE && DMask == 0) {
3787     DMask = 0x1;
3788     DMaskLanes = 1;
3789     MI.getOperand(DMaskIdx).setImm(DMask);
3790   }
3791 
3792   if (BaseOpcode->Atomic) {
3793     Register VData0 = MI.getOperand(2).getReg();
3794     LLT Ty = MRI->getType(VData0);
3795 
3796     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3797     if (Ty.isVector())
3798       return false;
3799 
3800     if (BaseOpcode->AtomicX2) {
3801       Register VData1 = MI.getOperand(3).getReg();
3802       // The two values are packed in one register.
3803       LLT PackedTy = LLT::vector(2, Ty);
3804       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3805       MI.getOperand(2).setReg(Concat.getReg(0));
3806       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3807     }
3808   }
3809 
3810   int CorrectedNumVAddrs = NumVAddrs;
3811 
3812   // Optimize _L to _LZ when _L is zero
3813   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3814         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3815     const ConstantFP *ConstantLod;
3816     const int LodIdx = AddrIdx + NumVAddrs - 1;
3817 
3818     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3819       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3820         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3821         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3822           LZMappingInfo->LZ, ImageDimIntr->Dim);
3823 
3824         // The starting indexes should remain in the same place.
3825         --NumVAddrs;
3826         --CorrectedNumVAddrs;
3827 
3828         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3829           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3830         MI.RemoveOperand(LodIdx);
3831       }
3832     }
3833   }
3834 
3835   // Optimize _mip away, when 'lod' is zero
3836   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3837     int64_t ConstantLod;
3838     const int LodIdx = AddrIdx + NumVAddrs - 1;
3839 
3840     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3841       if (ConstantLod == 0) {
3842         // TODO: Change intrinsic opcode and remove operand instead or replacing
3843         // it with 0, as the _L to _LZ handling is done above.
3844         MI.getOperand(LodIdx).ChangeToImmediate(0);
3845         --CorrectedNumVAddrs;
3846       }
3847     }
3848   }
3849 
3850   // Rewrite the addressing register layout before doing anything else.
3851   if (IsA16 || IsG16) {
3852     if (IsA16) {
3853       // Target must support the feature and gradients need to be 16 bit too
3854       if (!ST.hasA16() || !IsG16)
3855         return false;
3856     } else if (!ST.hasG16())
3857       return false;
3858 
3859     if (NumVAddrs > 1) {
3860       SmallVector<Register, 4> PackedRegs;
3861       // Don't compress addresses for G16
3862       const int PackEndIdx =
3863           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3864       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3865                                   PackEndIdx, NumGradients);
3866 
3867       if (!IsA16) {
3868         // Add uncompressed address
3869         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3870           int AddrReg = MI.getOperand(I).getReg();
3871           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3872           PackedRegs.push_back(AddrReg);
3873         }
3874       }
3875 
3876       // See also below in the non-a16 branch
3877       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3878 
3879       if (!UseNSA && PackedRegs.size() > 1) {
3880         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3881         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3882         PackedRegs[0] = Concat.getReg(0);
3883         PackedRegs.resize(1);
3884       }
3885 
3886       const int NumPacked = PackedRegs.size();
3887       for (int I = 0; I != NumVAddrs; ++I) {
3888         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3889         if (!SrcOp.isReg()) {
3890           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3891           continue;
3892         }
3893 
3894         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3895 
3896         if (I < NumPacked)
3897           SrcOp.setReg(PackedRegs[I]);
3898         else
3899           SrcOp.setReg(AMDGPU::NoRegister);
3900       }
3901     }
3902   } else {
3903     // If the register allocator cannot place the address registers contiguously
3904     // without introducing moves, then using the non-sequential address encoding
3905     // is always preferable, since it saves VALU instructions and is usually a
3906     // wash in terms of code size or even better.
3907     //
3908     // However, we currently have no way of hinting to the register allocator
3909     // that MIMG addresses should be placed contiguously when it is possible to
3910     // do so, so force non-NSA for the common 2-address case as a heuristic.
3911     //
3912     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3913     // allocation when possible.
3914     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3915 
3916     if (!UseNSA && NumVAddrs > 1)
3917       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3918   }
3919 
3920   int Flags = 0;
3921   if (IsA16)
3922     Flags |= 1;
3923   if (IsG16)
3924     Flags |= 2;
3925   MI.addOperand(MachineOperand::CreateImm(Flags));
3926 
3927   if (BaseOpcode->Store) { // No TFE for stores?
3928     // TODO: Handle dmask trim
3929     Register VData = MI.getOperand(1).getReg();
3930     LLT Ty = MRI->getType(VData);
3931     if (!Ty.isVector() || Ty.getElementType() != S16)
3932       return true;
3933 
3934     Register RepackedReg = handleD16VData(B, *MRI, VData);
3935     if (RepackedReg != VData) {
3936       MI.getOperand(1).setReg(RepackedReg);
3937     }
3938 
3939     return true;
3940   }
3941 
3942   Register DstReg = MI.getOperand(0).getReg();
3943   LLT Ty = MRI->getType(DstReg);
3944   const LLT EltTy = Ty.getScalarType();
3945   const bool IsD16 = Ty.getScalarType() == S16;
3946   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3947 
3948   // Confirm that the return type is large enough for the dmask specified
3949   if (NumElts < DMaskLanes)
3950     return false;
3951 
3952   if (NumElts > 4 || DMaskLanes > 4)
3953     return false;
3954 
3955   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3956   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3957 
3958   // The raw dword aligned data component of the load. The only legal cases
3959   // where this matters should be when using the packed D16 format, for
3960   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3961   LLT RoundedTy;
3962 
3963   // S32 vector to to cover all data, plus TFE result element.
3964   LLT TFETy;
3965 
3966   // Register type to use for each loaded component. Will be S32 or V2S16.
3967   LLT RegTy;
3968 
3969   if (IsD16 && ST.hasUnpackedD16VMem()) {
3970     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3971     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3972     RegTy = S32;
3973   } else {
3974     unsigned EltSize = EltTy.getSizeInBits();
3975     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3976     unsigned RoundedSize = 32 * RoundedElts;
3977     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3978     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3979     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3980   }
3981 
3982   // The return type does not need adjustment.
3983   // TODO: Should we change s16 case to s32 or <2 x s16>?
3984   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3985     return true;
3986 
3987   Register Dst1Reg;
3988 
3989   // Insert after the instruction.
3990   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3991 
3992   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3993   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3994   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3995   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3996 
3997   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3998 
3999   MI.getOperand(0).setReg(NewResultReg);
4000 
4001   // In the IR, TFE is supposed to be used with a 2 element struct return
4002   // type. The intruction really returns these two values in one contiguous
4003   // register, with one additional dword beyond the loaded data. Rewrite the
4004   // return type to use a single register result.
4005 
4006   if (IsTFE) {
4007     Dst1Reg = MI.getOperand(1).getReg();
4008     if (MRI->getType(Dst1Reg) != S32)
4009       return false;
4010 
4011     // TODO: Make sure the TFE operand bit is set.
4012     MI.RemoveOperand(1);
4013 
4014     // Handle the easy case that requires no repack instructions.
4015     if (Ty == S32) {
4016       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4017       return true;
4018     }
4019   }
4020 
4021   // Now figure out how to copy the new result register back into the old
4022   // result.
4023   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4024 
4025   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4026 
4027   if (ResultNumRegs == 1) {
4028     assert(!IsTFE);
4029     ResultRegs[0] = NewResultReg;
4030   } else {
4031     // We have to repack into a new vector of some kind.
4032     for (int I = 0; I != NumDataRegs; ++I)
4033       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4034     B.buildUnmerge(ResultRegs, NewResultReg);
4035 
4036     // Drop the final TFE element to get the data part. The TFE result is
4037     // directly written to the right place already.
4038     if (IsTFE)
4039       ResultRegs.resize(NumDataRegs);
4040   }
4041 
4042   // For an s16 scalar result, we form an s32 result with a truncate regardless
4043   // of packed vs. unpacked.
4044   if (IsD16 && !Ty.isVector()) {
4045     B.buildTrunc(DstReg, ResultRegs[0]);
4046     return true;
4047   }
4048 
4049   // Avoid a build/concat_vector of 1 entry.
4050   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4051     B.buildBitcast(DstReg, ResultRegs[0]);
4052     return true;
4053   }
4054 
4055   assert(Ty.isVector());
4056 
4057   if (IsD16) {
4058     // For packed D16 results with TFE enabled, all the data components are
4059     // S32. Cast back to the expected type.
4060     //
4061     // TODO: We don't really need to use load s32 elements. We would only need one
4062     // cast for the TFE result if a multiple of v2s16 was used.
4063     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4064       for (Register &Reg : ResultRegs)
4065         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4066     } else if (ST.hasUnpackedD16VMem()) {
4067       for (Register &Reg : ResultRegs)
4068         Reg = B.buildTrunc(S16, Reg).getReg(0);
4069     }
4070   }
4071 
4072   auto padWithUndef = [&](LLT Ty, int NumElts) {
4073     if (NumElts == 0)
4074       return;
4075     Register Undef = B.buildUndef(Ty).getReg(0);
4076     for (int I = 0; I != NumElts; ++I)
4077       ResultRegs.push_back(Undef);
4078   };
4079 
4080   // Pad out any elements eliminated due to the dmask.
4081   LLT ResTy = MRI->getType(ResultRegs[0]);
4082   if (!ResTy.isVector()) {
4083     padWithUndef(ResTy, NumElts - ResultRegs.size());
4084     B.buildBuildVector(DstReg, ResultRegs);
4085     return true;
4086   }
4087 
4088   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4089   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4090 
4091   // Deal with the one annoying legal case.
4092   const LLT V3S16 = LLT::vector(3, 16);
4093   if (Ty == V3S16) {
4094     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4095     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4096     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4097     return true;
4098   }
4099 
4100   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4101   B.buildConcatVectors(DstReg, ResultRegs);
4102   return true;
4103 }
4104 
4105 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4106   MachineInstr &MI, MachineIRBuilder &B,
4107   GISelChangeObserver &Observer) const {
4108   Register Dst = MI.getOperand(0).getReg();
4109   LLT Ty = B.getMRI()->getType(Dst);
4110   unsigned Size = Ty.getSizeInBits();
4111   MachineFunction &MF = B.getMF();
4112 
4113   Observer.changingInstr(MI);
4114 
4115   // FIXME: We don't really need this intermediate instruction. The intrinsic
4116   // should be fixed to have a memory operand. Since it's readnone, we're not
4117   // allowed to add one.
4118   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4119   MI.RemoveOperand(1); // Remove intrinsic ID
4120 
4121   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4122   // TODO: Should this use datalayout alignment?
4123   const unsigned MemSize = (Size + 7) / 8;
4124   const Align MemAlign(4);
4125   MachineMemOperand *MMO = MF.getMachineMemOperand(
4126       MachinePointerInfo(),
4127       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4128           MachineMemOperand::MOInvariant,
4129       MemSize, MemAlign);
4130   MI.addMemOperand(MF, MMO);
4131 
4132   // There are no 96-bit result scalar loads, but widening to 128-bit should
4133   // always be legal. We may need to restore this to a 96-bit result if it turns
4134   // out this needs to be converted to a vector load during RegBankSelect.
4135   if (!isPowerOf2_32(Size)) {
4136     LegalizerHelper Helper(MF, *this, Observer, B);
4137 
4138     if (Ty.isVector())
4139       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4140     else
4141       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4142   }
4143 
4144   Observer.changedInstr(MI);
4145   return true;
4146 }
4147 
4148 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4149                                                 MachineRegisterInfo &MRI,
4150                                                 MachineIRBuilder &B) const {
4151   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4152   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4153       !ST.isTrapHandlerEnabled()) {
4154     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4155   } else {
4156     // Pass queue pointer to trap handler as input, and insert trap instruction
4157     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4158     MachineRegisterInfo &MRI = *B.getMRI();
4159     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4160     Register LiveIn = getLiveInRegister(
4161         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4162         /*InsertLiveInCopy=*/false);
4163     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4164       return false;
4165     B.buildCopy(SGPR01, LiveIn);
4166     B.buildInstr(AMDGPU::S_TRAP)
4167         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4168         .addReg(SGPR01, RegState::Implicit);
4169   }
4170 
4171   MI.eraseFromParent();
4172   return true;
4173 }
4174 
4175 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4176     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4177   // Is non-HSA path or trap-handler disabled? then, report a warning
4178   // accordingly
4179   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4180       !ST.isTrapHandlerEnabled()) {
4181     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4182                                      "debugtrap handler not supported",
4183                                      MI.getDebugLoc(), DS_Warning);
4184     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4185     Ctx.diagnose(NoTrap);
4186   } else {
4187     // Insert debug-trap instruction
4188     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4189   }
4190 
4191   MI.eraseFromParent();
4192   return true;
4193 }
4194 
// Top-level dispatch for target-intrinsic legalization. Returns true when the
// intrinsic was handled (or needs no custom lowering), false to report a
// legalization failure.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    // verifyCFIntrinsic checks the expected G_BRCOND use pattern and fills in
    // the optional trailing G_BR and its fallthrough target.
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      // Build the SI_IF/SI_ELSE pseudo where the conditional branch was.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget)
          .addImm(0);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The pseudo's mask operands live in the wave mask (exec-sized)
      // register class, not an ordinary 1-bit value.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      // Replace the conditional branch with the SI_LOOP pseudo.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  // Preloaded-argument intrinsics: forward to the corresponding function
  // argument slot.
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  // Buffer stores: the two bool arguments select format/typed variants.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  // Buffer loads, analogous variant selection.
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // Buffer atomics all share a single lowering keyed on the intrinsic ID.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // Image intrinsics are table-driven; anything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
4393