1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
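// (e.g. <3 x s16> becomes <4 x s16>).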
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
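// (e.g. s48 becomes s64).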
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
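// Match odd-element vectors of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.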
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
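// Match 16-bit element vectors with more than two elements, e.g. <4 x s16>.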
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
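// Reduce the number of elements so the resulting type is at most 64 bits
// wide, e.g. <4 x s32> becomes <2 x s32>.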
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
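// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits).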
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
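      // <6 x s16> -> <3 x s32>
      // <8 x s16> -> <4 x s32>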
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
    // Technically the saturating operations require clamp bit support, but
    // clamp support was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
    // We're just lowering this, but trying to coerce to the desired type
    // first helps produce a better result.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .customIf(typeIsNot(0, PrivatePtr));
555 
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
675     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
676     // situations (like an invalid implicit use), we don't want to infinite loop
677     // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
715     .scalarize(0)
716     .lower();
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
720       .legalFor({S16, S32, S64})
721       .clampScalar(0, S16, S64)
722       .scalarize(0);
723   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
724     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
725       .legalFor({S32, S64})
726       .clampScalar(0, S32, S64)
727       .scalarize(0);
728   } else {
729     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
730       .legalFor({S32})
731       .customFor({S64})
732       .clampScalar(0, S32, S64)
733       .scalarize(0);
734   }
735 
736   getActionDefinitionsBuilder(G_PTR_ADD)
737     .legalIf(all(isPointer(0), sameSize(0, 1)))
738     .scalarize(0)
739     .scalarSameSizeAs(1, 0);
740 
741   getActionDefinitionsBuilder(G_PTRMASK)
742     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
743     .scalarSameSizeAs(1, 0)
744     .scalarize(0);
745 
746   auto &CmpBuilder =
747     getActionDefinitionsBuilder(G_ICMP)
748     // The compare output type differs based on the register bank of the output,
749     // so make both s1 and s32 legal.
750     //
751     // Scalar compares producing output in scc will be promoted to s32, as that
752     // is the allocatable register type that will be needed for the copy from
753     // scc. This will be promoted during RegBankSelect, and we assume something
754     // before that won't try to use s32 result types.
755     //
756     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
757     // bank.
758     .legalForCartesianProduct(
759       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
760     .legalForCartesianProduct(
761       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
762   if (ST.has16BitInsts()) {
763     CmpBuilder.legalFor({{S1, S16}});
764   }
765 
766   CmpBuilder
767     .widenScalarToNextPow2(1)
768     .clampScalar(1, S32, S64)
769     .scalarize(0)
770     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
771 
772   getActionDefinitionsBuilder(G_FCMP)
773     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
774     .widenScalarToNextPow2(1)
775     .clampScalar(1, S32, S64)
776     .scalarize(0);
777 
778   // FIXME: fpow has a selection pattern that should move to custom lowering.
779   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
780   if (ST.has16BitInsts())
781     Exp2Ops.legalFor({S32, S16});
782   else
783     Exp2Ops.legalFor({S32});
784   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
785   Exp2Ops.scalarize(0);
786 
787   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
788   if (ST.has16BitInsts())
789     ExpOps.customFor({{S32}, {S16}});
790   else
791     ExpOps.customFor({S32});
792   ExpOps.clampScalar(0, MinScalarFPTy, S32)
793         .scalarize(0);
794 
795   getActionDefinitionsBuilder(G_FPOWI)
796     .clampScalar(0, MinScalarFPTy, S32)
797     .lower();
798 
799   // The 64-bit versions produce 32-bit results, but only on the SALU.
800   getActionDefinitionsBuilder(G_CTPOP)
801     .legalFor({{S32, S32}, {S32, S64}})
802     .clampScalar(0, S32, S32)
803     .clampScalar(1, S32, S64)
804     .scalarize(0)
805     .widenScalarToNextPow2(0, 32)
806     .widenScalarToNextPow2(1, 32);
807 
808   // The hardware instructions return a different result on 0 than the generic
809   // instructions expect. The hardware produces -1, but these produce the
810   // bitwidth.
811   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
812     .scalarize(0)
813     .clampScalar(0, S32, S32)
814     .clampScalar(1, S32, S64)
815     .widenScalarToNextPow2(0, 32)
816     .widenScalarToNextPow2(1, 32)
817     .lower();
818 
819   // The 64-bit versions produce 32-bit results, but only on the SALU.
820   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
821     .legalFor({{S32, S32}, {S32, S64}})
822     .clampScalar(0, S32, S32)
823     .clampScalar(1, S32, S64)
824     .scalarize(0)
825     .widenScalarToNextPow2(0, 32)
826     .widenScalarToNextPow2(1, 32);
827 
828   getActionDefinitionsBuilder(G_BITREVERSE)
829     .legalFor({S32})
830     .clampScalar(0, S32, S32)
831     .scalarize(0);
832 
833   if (ST.has16BitInsts()) {
834     getActionDefinitionsBuilder(G_BSWAP)
835       .legalFor({S16, S32, V2S16})
836       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
839       .widenScalarToNextPow2(0)
840       .clampScalar(0, S16, S32)
841       .scalarize(0);
842 
843     if (ST.hasVOP3PInsts()) {
844       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
845         .legalFor({S32, S16, V2S16})
846         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
847         .clampMaxNumElements(0, S16, 2)
848         .minScalar(0, S16)
849         .widenScalarToNextPow2(0)
850         .scalarize(0)
851         .lower();
852     } else {
853       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
854         .legalFor({S32, S16})
855         .widenScalarToNextPow2(0)
856         .minScalar(0, S16)
857         .scalarize(0)
858         .lower();
859     }
860   } else {
861     // TODO: Should have same legality without v_perm_b32
862     getActionDefinitionsBuilder(G_BSWAP)
863       .legalFor({S32})
864       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
867       .widenScalarToNextPow2(0)
868       .maxScalar(0, S32)
869       .scalarize(0)
870       .lower();
871 
872     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
873       .legalFor({S32})
874       .minScalar(0, S32)
875       .widenScalarToNextPow2(0)
876       .scalarize(0)
877       .lower();
878   }
879 
880   getActionDefinitionsBuilder(G_INTTOPTR)
881     // List the common cases
882     .legalForCartesianProduct(AddrSpaces64, {S64})
883     .legalForCartesianProduct(AddrSpaces32, {S32})
884     .scalarize(0)
885     // Accept any address space as long as the size matches
886     .legalIf(sameSize(0, 1))
887     .widenScalarIf(smallerThan(1, 0),
888       [](const LegalityQuery &Query) {
889         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
890       })
891     .narrowScalarIf(largerThan(1, 0),
892       [](const LegalityQuery &Query) {
893         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
894       });
895 
896   getActionDefinitionsBuilder(G_PTRTOINT)
897     // List the common cases
898     .legalForCartesianProduct(AddrSpaces64, {S64})
899     .legalForCartesianProduct(AddrSpaces32, {S32})
900     .scalarize(0)
901     // Accept any address space as long as the size matches
902     .legalIf(sameSize(0, 1))
903     .widenScalarIf(smallerThan(0, 1),
904       [](const LegalityQuery &Query) {
905         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
906       })
907     .narrowScalarIf(
908       largerThan(0, 1),
909       [](const LegalityQuery &Query) {
910         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
911       });
912 
913   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
914     .scalarize(0)
915     .custom();
916 
917   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
918                                     bool IsLoad) -> bool {
919     const LLT DstTy = Query.Types[0];
920 
921     // Split vector extloads.
922     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
923     unsigned Align = Query.MMODescrs[0].AlignInBits;
924 
925     if (MemSize < DstTy.getSizeInBits())
926       MemSize = std::max(MemSize, Align);
927 
928     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
929       return true;
930 
931     const LLT PtrTy = Query.Types[1];
932     unsigned AS = PtrTy.getAddressSpace();
933     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
934       return true;
935 
936     // Catch weird sized loads that don't evenly divide into the access sizes
937     // TODO: May be able to widen depending on alignment etc.
938     unsigned NumRegs = (MemSize + 31) / 32;
939     if (NumRegs == 3) {
940       if (!ST.hasDwordx3LoadStores())
941         return true;
942     } else {
943       // If the alignment allows, these should have been widened.
944       if (!isPowerOf2_32(NumRegs))
945         return true;
946     }
947 
948     if (Align < MemSize) {
949       const SITargetLowering *TLI = ST.getTargetLowering();
950       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
951     }
952 
953     return false;
954   };
955 
956   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
957                                          unsigned Opc) -> bool {
958     unsigned Size = Query.Types[0].getSizeInBits();
959     if (isPowerOf2_32(Size))
960       return false;
961 
962     if (Size == 96 && ST.hasDwordx3LoadStores())
963       return false;
964 
965     unsigned AddrSpace = Query.Types[1].getAddressSpace();
966     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
967       return false;
968 
969     unsigned Align = Query.MMODescrs[0].AlignInBits;
970     unsigned RoundedSize = NextPowerOf2(Size);
971     return (Align >= RoundedSize);
972   };
973 
974   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
975   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
976   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
977 
978   // TODO: Refine based on subtargets which support unaligned access or 128-bit
979   // LDS
980   // TODO: Unsupported flat for SI.
981 
982   for (unsigned Op : {G_LOAD, G_STORE}) {
983     const bool IsStore = Op == G_STORE;
984 
985     auto &Actions = getActionDefinitionsBuilder(Op);
986     // Explicitly list some common cases.
987     // TODO: Does this help compile time at all?
988     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
989                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
990                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
991                                       {S64, GlobalPtr, 64, GlobalAlign32},
992                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
993                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
994                                       {S32, GlobalPtr, 8, GlobalAlign8},
995                                       {S32, GlobalPtr, 16, GlobalAlign16},
996 
997                                       {S32, LocalPtr, 32, 32},
998                                       {S64, LocalPtr, 64, 32},
999                                       {V2S32, LocalPtr, 64, 32},
1000                                       {S32, LocalPtr, 8, 8},
1001                                       {S32, LocalPtr, 16, 16},
1002                                       {V2S16, LocalPtr, 32, 32},
1003 
1004                                       {S32, PrivatePtr, 32, 32},
1005                                       {S32, PrivatePtr, 8, 8},
1006                                       {S32, PrivatePtr, 16, 16},
1007                                       {V2S16, PrivatePtr, 32, 32},
1008 
1009                                       {S32, ConstantPtr, 32, GlobalAlign32},
1010                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1011                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1012                                       {S64, ConstantPtr, 64, GlobalAlign32},
1013                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1014     Actions.legalIf(
1015       [=](const LegalityQuery &Query) -> bool {
1016         return isLoadStoreLegal(ST, Query, Op);
1017       });
1018 
1019     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1020     // 64-bits.
1021     //
1022     // TODO: Should generalize bitcast action into coerce, which will also cover
1023     // inserting addrspacecasts.
1024     Actions.customIf(typeIs(1, Constant32Ptr));
1025 
1026     // Turn any illegal element vectors into something easier to deal
1027     // with. These will ultimately produce 32-bit scalar shifts to extract the
1028     // parts anyway.
1029     //
1030     // For odd 16-bit element vectors, prefer to split those into pieces with
1031     // 16-bit vector parts.
1032     Actions.bitcastIf(
1033       [=](const LegalityQuery &Query) -> bool {
1034         const LLT Ty = Query.Types[0];
1035         const unsigned Size = Ty.getSizeInBits();
1036 
1037         if (Size != Query.MMODescrs[0].SizeInBits)
1038           return Size <= 32 && Ty.isVector();
1039 
1040         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1041           return true;
1042         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1043                !isRegisterVectorElementType(Ty.getElementType());
1044       }, bitcastToRegisterType(0));
1045 
1046     Actions
1047         .customIf(typeIs(1, Constant32Ptr))
1048         // Widen suitably aligned loads by loading extra elements.
1049         .moreElementsIf([=](const LegalityQuery &Query) {
1050             const LLT Ty = Query.Types[0];
1051             return Op == G_LOAD && Ty.isVector() &&
1052                    shouldWidenLoadResult(Query, Op);
1053           }, moreElementsToNextPow2(0))
1054         .widenScalarIf([=](const LegalityQuery &Query) {
1055             const LLT Ty = Query.Types[0];
1056             return Op == G_LOAD && !Ty.isVector() &&
1057                    shouldWidenLoadResult(Query, Op);
1058           }, widenScalarOrEltToNextPow2(0))
1059         .narrowScalarIf(
1060             [=](const LegalityQuery &Query) -> bool {
1061               return !Query.Types[0].isVector() &&
1062                      needToSplitMemOp(Query, Op == G_LOAD);
1063             },
1064             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1065               const LLT DstTy = Query.Types[0];
1066               const LLT PtrTy = Query.Types[1];
1067 
1068               const unsigned DstSize = DstTy.getSizeInBits();
1069               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1070 
1071               // Split extloads.
1072               if (DstSize > MemSize)
1073                 return std::make_pair(0, LLT::scalar(MemSize));
1074 
1075               if (!isPowerOf2_32(DstSize)) {
1076                 // We're probably decomposing an odd sized store. Try to split
1077                 // to the widest type. TODO: Account for alignment. As-is it
1078                 // should be OK, since the new parts will be further legalized.
1079                 unsigned FloorSize = PowerOf2Floor(DstSize);
1080                 return std::make_pair(0, LLT::scalar(FloorSize));
1081               }
1082 
1083               if (DstSize > 32 && (DstSize % 32 != 0)) {
1084                 // FIXME: Need a way to specify non-extload of larger size if
1085                 // suitably aligned.
1086                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1087               }
1088 
1089               unsigned MaxSize = maxSizeForAddrSpace(ST,
1090                                                      PtrTy.getAddressSpace(),
1091                                                      Op == G_LOAD);
1092               if (MemSize > MaxSize)
1093                 return std::make_pair(0, LLT::scalar(MaxSize));
1094 
1095               unsigned Align = Query.MMODescrs[0].AlignInBits;
1096               return std::make_pair(0, LLT::scalar(Align));
1097             })
1098         .fewerElementsIf(
1099             [=](const LegalityQuery &Query) -> bool {
1100               return Query.Types[0].isVector() &&
1101                      needToSplitMemOp(Query, Op == G_LOAD);
1102             },
1103             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1104               const LLT DstTy = Query.Types[0];
1105               const LLT PtrTy = Query.Types[1];
1106 
1107               LLT EltTy = DstTy.getElementType();
1108               unsigned MaxSize = maxSizeForAddrSpace(ST,
1109                                                      PtrTy.getAddressSpace(),
1110                                                      Op == G_LOAD);
1111 
1112               // FIXME: Handle widened to power of 2 results better. This ends
1113               // up scalarizing.
1114               // FIXME: 3 element stores scalarized on SI
1115 
1116               // Split if it's too large for the address space.
1117               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1118                 unsigned NumElts = DstTy.getNumElements();
1119                 unsigned EltSize = EltTy.getSizeInBits();
1120 
1121                 if (MaxSize % EltSize == 0) {
1122                   return std::make_pair(
1123                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1124                 }
1125 
1126                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1127 
1128                 // FIXME: Refine when odd breakdowns handled
1129                 // The scalars will need to be re-legalized.
1130                 if (NumPieces == 1 || NumPieces >= NumElts ||
1131                     NumElts % NumPieces != 0)
1132                   return std::make_pair(0, EltTy);
1133 
1134                 return std::make_pair(0,
1135                                       LLT::vector(NumElts / NumPieces, EltTy));
1136               }
1137 
1138               // FIXME: We could probably handle weird extending loads better.
1139               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1140               if (DstTy.getSizeInBits() > MemSize)
1141                 return std::make_pair(0, EltTy);
1142 
1143               unsigned EltSize = EltTy.getSizeInBits();
1144               unsigned DstSize = DstTy.getSizeInBits();
1145               if (!isPowerOf2_32(DstSize)) {
1146                 // We're probably decomposing an odd sized store. Try to split
1147                 // to the widest type. TODO: Account for alignment. As-is it
1148                 // should be OK, since the new parts will be further legalized.
1149                 unsigned FloorSize = PowerOf2Floor(DstSize);
1150                 return std::make_pair(
1151                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1152               }
1153 
1154               // Need to split because of alignment.
1155               unsigned Align = Query.MMODescrs[0].AlignInBits;
1156               if (EltSize > Align &&
1157                   (EltSize / Align < DstTy.getNumElements())) {
1158                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1159               }
1160 
1161               // May need relegalization for the scalars.
1162               return std::make_pair(0, EltTy);
1163             })
1164         .minScalar(0, S32);
1165 
1166     if (IsStore)
1167       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1168 
1169     // TODO: Need a bitcast lower option?
1170     Actions
1171         .widenScalarToNextPow2(0)
1172         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1173   }
1174 
1175   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1176                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1177                                                   {S32, GlobalPtr, 16, 2 * 8},
1178                                                   {S32, LocalPtr, 8, 8},
1179                                                   {S32, LocalPtr, 16, 16},
1180                                                   {S32, PrivatePtr, 8, 8},
1181                                                   {S32, PrivatePtr, 16, 16},
1182                                                   {S32, ConstantPtr, 8, 8},
1183                                                   {S32, ConstantPtr, 16, 2 * 8}});
1184   if (ST.hasFlatAddressSpace()) {
1185     ExtLoads.legalForTypesWithMemDesc(
1186         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1187   }
1188 
1189   ExtLoads.clampScalar(0, S32, S32)
1190           .widenScalarToNextPow2(0)
1191           .unsupportedIfMemSizeNotPow2()
1192           .lower();
1193 
1194   auto &Atomics = getActionDefinitionsBuilder(
1195     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1196      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1197      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1198      G_ATOMICRMW_UMIN})
1199     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1200                {S64, GlobalPtr}, {S64, LocalPtr},
1201                {S32, RegionPtr}, {S64, RegionPtr}});
1202   if (ST.hasFlatAddressSpace()) {
1203     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1204   }
1205 
1206   if (ST.hasLDSFPAtomics()) {
1207     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1208       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1209   }
1210 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1213   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1214     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1215                 {S32, FlatPtr}, {S64, FlatPtr}})
1216     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1217                {S32, RegionPtr}, {S64, RegionPtr}});
1218   // TODO: Pointer types, any 32-bit or 64-bit vector
1219 
1220   // Condition should be s32 for scalar, s1 for vector.
1221   getActionDefinitionsBuilder(G_SELECT)
1222     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1223           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1224           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1225     .clampScalar(0, S16, S64)
1226     .scalarize(1)
1227     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1228     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1229     .clampMaxNumElements(0, S32, 2)
1230     .clampMaxNumElements(0, LocalPtr, 2)
1231     .clampMaxNumElements(0, PrivatePtr, 2)
1232     .scalarize(0)
1233     .widenScalarToNextPow2(0)
1234     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1235 
1236   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1237   // be more flexible with the shift amount type.
1238   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1239     .legalFor({{S32, S32}, {S64, S32}});
1240   if (ST.has16BitInsts()) {
1241     if (ST.hasVOP3PInsts()) {
1242       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1243             .clampMaxNumElements(0, S16, 2);
1244     } else
1245       Shifts.legalFor({{S16, S16}});
1246 
1247     // TODO: Support 16-bit shift amounts for all types
1248     Shifts.widenScalarIf(
1249       [=](const LegalityQuery &Query) {
1250         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1251         // 32-bit amount.
1252         const LLT ValTy = Query.Types[0];
1253         const LLT AmountTy = Query.Types[1];
1254         return ValTy.getSizeInBits() <= 16 &&
1255                AmountTy.getSizeInBits() < 16;
1256       }, changeTo(1, S16));
1257     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1258     Shifts.clampScalar(1, S32, S32);
1259     Shifts.clampScalar(0, S16, S64);
1260     Shifts.widenScalarToNextPow2(0, 16);
1261   } else {
1262     // Make sure we legalize the shift amount type first, as the general
1263     // expansion for the shifted type will produce much worse code if it hasn't
1264     // been truncated already.
1265     Shifts.clampScalar(1, S32, S32);
1266     Shifts.clampScalar(0, S32, S64);
1267     Shifts.widenScalarToNextPow2(0, 32);
1268   }
1269   Shifts.scalarize(0);
1270 
1271   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1272     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1273     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1274     unsigned IdxTypeIdx = 2;
1275 
1276     getActionDefinitionsBuilder(Op)
1277       .customIf([=](const LegalityQuery &Query) {
1278           const LLT EltTy = Query.Types[EltTypeIdx];
1279           const LLT VecTy = Query.Types[VecTypeIdx];
1280           const LLT IdxTy = Query.Types[IdxTypeIdx];
1281           return (EltTy.getSizeInBits() == 16 ||
1282                   EltTy.getSizeInBits() % 32 == 0) &&
1283                  VecTy.getSizeInBits() % 32 == 0 &&
1284                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1285                  IdxTy.getSizeInBits() == 32;
1286         })
1287       .clampScalar(EltTypeIdx, S32, S64)
1288       .clampScalar(VecTypeIdx, S32, S64)
1289       .clampScalar(IdxTypeIdx, S32, S32)
1290       // TODO: Clamp the number of elements before resorting to stack lowering.
1291       // It should only be necessary with variable indexes.
1292       // As a last resort, lower to the stack
1293       .lower();
1294   }
1295 
1296   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1297     .unsupportedIf([=](const LegalityQuery &Query) {
1298         const LLT &EltTy = Query.Types[1].getElementType();
1299         return Query.Types[0] != EltTy;
1300       });
1301 
1302   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1303     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1304     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1305 
1306     // FIXME: Doesn't handle extract of illegal sizes.
1307     getActionDefinitionsBuilder(Op)
1308       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1309       // FIXME: Multiples of 16 should not be legal.
1310       .legalIf([=](const LegalityQuery &Query) {
1311           const LLT BigTy = Query.Types[BigTyIdx];
1312           const LLT LitTy = Query.Types[LitTyIdx];
1313           return (BigTy.getSizeInBits() % 32 == 0) &&
1314                  (LitTy.getSizeInBits() % 16 == 0);
1315         })
1316       .widenScalarIf(
1317         [=](const LegalityQuery &Query) {
1318           const LLT BigTy = Query.Types[BigTyIdx];
1319           return (BigTy.getScalarSizeInBits() < 16);
1320         },
1321         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1322       .widenScalarIf(
1323         [=](const LegalityQuery &Query) {
1324           const LLT LitTy = Query.Types[LitTyIdx];
1325           return (LitTy.getScalarSizeInBits() < 16);
1326         },
1327         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1328       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1329       .widenScalarToNextPow2(BigTyIdx, 32);
1330 
1331   }
1332 
1333   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1334     .legalForCartesianProduct(AllS32Vectors, {S32})
1335     .legalForCartesianProduct(AllS64Vectors, {S64})
1336     .clampNumElements(0, V16S32, V32S32)
1337     .clampNumElements(0, V2S64, V16S64)
1338     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1339 
1340   if (ST.hasScalarPackInsts()) {
1341     BuildVector
1342       // FIXME: Should probably widen s1 vectors straight to s32
1343       .minScalarOrElt(0, S16)
1344       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1345       .minScalar(1, S32);
1346 
1347     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1348       .legalFor({V2S16, S32})
1349       .lower();
1350     BuildVector.minScalarOrElt(0, S32);
1351   } else {
1352     BuildVector.customFor({V2S16, S16});
1353     BuildVector.minScalarOrElt(0, S32);
1354 
1355     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1356       .customFor({V2S16, S32})
1357       .lower();
1358   }
1359 
1360   BuildVector.legalIf(isRegisterType(0));
1361 
1362   // FIXME: Clamp maximum size
1363   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1364     .legalIf(isRegisterType(0));
1365 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1368   if (ST.hasVOP3PInsts()) {
1369     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1370       .customFor({V2S16, V2S16})
1371       .lower();
1372   } else
1373     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1374 
1375   // Merge/Unmerge
1376   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1377     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1378     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1379 
1380     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1381       const LLT Ty = Query.Types[TypeIdx];
1382       if (Ty.isVector()) {
1383         const LLT &EltTy = Ty.getElementType();
1384         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1385           return true;
1386         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1387           return true;
1388       }
1389       return false;
1390     };
1391 
1392     auto &Builder = getActionDefinitionsBuilder(Op)
1393       .lowerFor({{S16, V2S16}})
1394       .lowerIf([=](const LegalityQuery &Query) {
1395           const LLT BigTy = Query.Types[BigTyIdx];
1396           return BigTy.getSizeInBits() == 32;
1397         })
1398       // Try to widen to s16 first for small types.
1399       // TODO: Only do this on targets with legal s16 shifts
1400       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1401       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1402       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1403       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1404                            elementTypeIs(1, S16)),
1405                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1409       .clampScalar(LitTyIdx, S32, S512)
1410       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1411       // Break up vectors with weird elements into scalars
1412       .fewerElementsIf(
1413         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1414         scalarize(0))
1415       .fewerElementsIf(
1416         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1417         scalarize(1))
1418       .clampScalar(BigTyIdx, S32, MaxScalar);
1419 
1420     if (Op == G_MERGE_VALUES) {
1421       Builder.widenScalarIf(
1422         // TODO: Use 16-bit shifts if legal for 8-bit values?
1423         [=](const LegalityQuery &Query) {
1424           const LLT Ty = Query.Types[LitTyIdx];
1425           return Ty.getSizeInBits() < 32;
1426         },
1427         changeTo(LitTyIdx, S32));
1428     }
1429 
1430     Builder.widenScalarIf(
1431       [=](const LegalityQuery &Query) {
1432         const LLT Ty = Query.Types[BigTyIdx];
1433         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1434           Ty.getSizeInBits() % 16 != 0;
1435       },
1436       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1439         const LLT &Ty = Query.Types[BigTyIdx];
1440         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1441         if (NewSizeInBits >= 256) {
1442           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1443           if (RoundedTo < NewSizeInBits)
1444             NewSizeInBits = RoundedTo;
1445         }
1446         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1447       })
1448       .legalIf([=](const LegalityQuery &Query) {
1449           const LLT &BigTy = Query.Types[BigTyIdx];
1450           const LLT &LitTy = Query.Types[LitTyIdx];
1451 
1452           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1453             return false;
1454           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1455             return false;
1456 
1457           return BigTy.getSizeInBits() % 16 == 0 &&
1458                  LitTy.getSizeInBits() % 16 == 0 &&
1459                  BigTy.getSizeInBits() <= MaxRegisterSize;
1460         })
1461       // Any vectors left are the wrong size. Scalarize them.
1462       .scalarize(0)
1463       .scalarize(1);
1464   }
1465 
1466   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1467   // RegBankSelect.
1468   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1469     .legalFor({{S32}, {S64}});
1470 
1471   if (ST.hasVOP3PInsts()) {
1472     SextInReg.lowerFor({{V2S16}})
1473       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1474       // get more vector shift opportunities, since we'll get those when
1475       // expanded.
1476       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1477   } else if (ST.has16BitInsts()) {
1478     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1479   } else {
1480     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1482     SextInReg.lowerFor({{S32}, {S64}});
1483   }
1484 
1485   SextInReg
1486     .scalarize(0)
1487     .clampScalar(0, S32, S64)
1488     .lower();
1489 
1490   getActionDefinitionsBuilder(G_FSHR)
1491     .legalFor({{S32, S32}})
1492     .scalarize(0)
1493     .lower();
1494 
1495   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1496     .legalFor({S64});
1497 
1498   getActionDefinitionsBuilder(G_FENCE)
1499     .alwaysLegal();
1500 
1501   getActionDefinitionsBuilder({
1502       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1503       G_FCOPYSIGN,
1504 
1505       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1506       G_ATOMICRMW_NAND,
1507       G_ATOMICRMW_FSUB,
1508       G_READ_REGISTER,
1509       G_WRITE_REGISTER,
1510 
1511       G_SADDO, G_SSUBO,
1512 
      // TODO: Implement
1514       G_FMINIMUM, G_FMAXIMUM,
1515       G_FSHL
1516     }).lower();
1517 
1518   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1519         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1520         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1521     .unsupported();
1522 
1523   computeTables();
1524   verify(*ST.getInstrInfo());
1525 }
1526 
1527 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1528                                          MachineInstr &MI) const {
1529   MachineIRBuilder &B = Helper.MIRBuilder;
1530   MachineRegisterInfo &MRI = *B.getMRI();
1531   GISelChangeObserver &Observer = Helper.Observer;
1532 
1533   switch (MI.getOpcode()) {
1534   case TargetOpcode::G_ADDRSPACE_CAST:
1535     return legalizeAddrSpaceCast(MI, MRI, B);
1536   case TargetOpcode::G_FRINT:
1537     return legalizeFrint(MI, MRI, B);
1538   case TargetOpcode::G_FCEIL:
1539     return legalizeFceil(MI, MRI, B);
1540   case TargetOpcode::G_INTRINSIC_TRUNC:
1541     return legalizeIntrinsicTrunc(MI, MRI, B);
1542   case TargetOpcode::G_SITOFP:
1543     return legalizeITOFP(MI, MRI, B, true);
1544   case TargetOpcode::G_UITOFP:
1545     return legalizeITOFP(MI, MRI, B, false);
1546   case TargetOpcode::G_FPTOSI:
1547     return legalizeFPTOI(MI, MRI, B, true);
1548   case TargetOpcode::G_FPTOUI:
1549     return legalizeFPTOI(MI, MRI, B, false);
1550   case TargetOpcode::G_FMINNUM:
1551   case TargetOpcode::G_FMAXNUM:
1552   case TargetOpcode::G_FMINNUM_IEEE:
1553   case TargetOpcode::G_FMAXNUM_IEEE:
1554     return legalizeMinNumMaxNum(Helper, MI);
1555   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1556     return legalizeExtractVectorElt(MI, MRI, B);
1557   case TargetOpcode::G_INSERT_VECTOR_ELT:
1558     return legalizeInsertVectorElt(MI, MRI, B);
1559   case TargetOpcode::G_SHUFFLE_VECTOR:
1560     return legalizeShuffleVector(MI, MRI, B);
1561   case TargetOpcode::G_FSIN:
1562   case TargetOpcode::G_FCOS:
1563     return legalizeSinCos(MI, MRI, B);
1564   case TargetOpcode::G_GLOBAL_VALUE:
1565     return legalizeGlobalValue(MI, MRI, B);
1566   case TargetOpcode::G_LOAD:
1567     return legalizeLoad(MI, MRI, B, Observer);
1568   case TargetOpcode::G_FMAD:
1569     return legalizeFMad(MI, MRI, B);
1570   case TargetOpcode::G_FDIV:
1571     return legalizeFDIV(MI, MRI, B);
1572   case TargetOpcode::G_UDIV:
1573   case TargetOpcode::G_UREM:
1574     return legalizeUDIV_UREM(MI, MRI, B);
1575   case TargetOpcode::G_SDIV:
1576   case TargetOpcode::G_SREM:
1577     return legalizeSDIV_SREM(MI, MRI, B);
1578   case TargetOpcode::G_ATOMIC_CMPXCHG:
1579     return legalizeAtomicCmpXChg(MI, MRI, B);
1580   case TargetOpcode::G_FLOG:
1581     return legalizeFlog(MI, B, numbers::ln2f);
1582   case TargetOpcode::G_FLOG10:
1583     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1584   case TargetOpcode::G_FEXP:
1585     return legalizeFExp(MI, B);
1586   case TargetOpcode::G_FPOW:
1587     return legalizeFPow(MI, B);
1588   case TargetOpcode::G_FFLOOR:
1589     return legalizeFFloor(MI, MRI, B);
1590   case TargetOpcode::G_BUILD_VECTOR:
1591     return legalizeBuildVector(MI, MRI, B);
1592   default:
1593     return false;
1594   }
1595 
1596   llvm_unreachable("expected switch to return");
1597 }
1598 
1599 Register AMDGPULegalizerInfo::getSegmentAperture(
1600   unsigned AS,
1601   MachineRegisterInfo &MRI,
1602   MachineIRBuilder &B) const {
1603   MachineFunction &MF = B.getMF();
1604   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1605   const LLT S32 = LLT::scalar(32);
1606 
1607   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1608 
1609   if (ST.hasApertureRegs()) {
1610     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1611     // getreg.
1612     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1613         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1614         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1615     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1616         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1617         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1618     unsigned Encoding =
1619         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1620         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1621         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1622 
1623     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1624 
1625     B.buildInstr(AMDGPU::S_GETREG_B32)
1626       .addDef(GetReg)
1627       .addImm(Encoding);
1628     MRI.setType(GetReg, S32);
1629 
1630     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1631     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1632   }
1633 
1634   Register QueuePtr = MRI.createGenericVirtualRegister(
1635     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1636 
1637   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1638     return Register();
1639 
1640   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1641   // private_segment_aperture_base_hi.
1642   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1643 
1644   // TODO: can we be smarter about machine pointer info?
1645   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1646   MachineMemOperand *MMO = MF.getMachineMemOperand(
1647       PtrInfo,
1648       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1649           MachineMemOperand::MOInvariant,
1650       4, commonAlignment(Align(64), StructOffset));
1651 
1652   Register LoadAddr;
1653 
1654   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1655   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1659   MachineInstr &MI, MachineRegisterInfo &MRI,
1660   MachineIRBuilder &B) const {
1661   MachineFunction &MF = B.getMF();
1662 
1663   const LLT S32 = LLT::scalar(32);
1664   Register Dst = MI.getOperand(0).getReg();
1665   Register Src = MI.getOperand(1).getReg();
1666 
1667   LLT DstTy = MRI.getType(Dst);
1668   LLT SrcTy = MRI.getType(Src);
1669   unsigned DestAS = DstTy.getAddressSpace();
1670   unsigned SrcAS = SrcTy.getAddressSpace();
1671 
1672   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1673   // vector element.
1674   assert(!DstTy.isVector());
1675 
1676   const AMDGPUTargetMachine &TM
1677     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1678 
1679   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1680   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1681     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1682     return true;
1683   }
1684 
1685   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1686     // Truncate.
1687     B.buildExtract(Dst, Src, 0);
1688     MI.eraseFromParent();
1689     return true;
1690   }
1691 
1692   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1693     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1694     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1695 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1699     auto HighAddr = B.buildConstant(
1700       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1701     B.buildMerge(Dst, {Src, HighAddr});
1702     MI.eraseFromParent();
1703     return true;
1704   }
1705 
1706   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1707     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1708            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1709     unsigned NullVal = TM.getNullPointerValue(DestAS);
1710 
1711     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1712     auto FlatNull = B.buildConstant(SrcTy, 0);
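    // A null flat pointer must cast to the segment null value; the compare and
    // select below handle that case.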
1713 
1714     // Extract low 32-bits of the pointer.
1715     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1716 
1717     auto CmpRes =
1718         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1719     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1720 
1721     MI.eraseFromParent();
1722     return true;
1723   }
1724 
1725   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1726     return false;
1727 
1728   if (!ST.hasFlatAddressSpace())
1729     return false;
1730 
1731   auto SegmentNull =
1732       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1733   auto FlatNull =
1734       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1735 
1736   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1737   if (!ApertureReg.isValid())
1738     return false;
1739 
1740   auto CmpRes =
1741       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1742 
1743   // Coerce the type of the low half of the result so we can use merge_values.
1744   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1745 
1746   // TODO: Should we allow mismatched types but matching sizes in merges to
1747   // avoid the ptrtoint?
1748   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1749   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1750 
1751   MI.eraseFromParent();
1752   return true;
1753 }
1754 
1755 bool AMDGPULegalizerInfo::legalizeFrint(
1756   MachineInstr &MI, MachineRegisterInfo &MRI,
1757   MachineIRBuilder &B) const {
1758   Register Src = MI.getOperand(1).getReg();
1759   LLT Ty = MRI.getType(Src);
1760   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1761 
1762   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1763   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
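  // C1 is 2^52; adding copysign(2^52, Src) and subtracting it again snaps Src
  // to an integer in the current rounding mode, since doubles of that magnitude
  // have no fraction bits. C2 is the largest double below 2^52, so if
  // |Src| > C2 the value is already an integer and is returned unchanged.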
1764 
1765   auto C1 = B.buildFConstant(Ty, C1Val);
1766   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1767 
1768   // TODO: Should this propagate fast-math-flags?
1769   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1770   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1771 
1772   auto C2 = B.buildFConstant(Ty, C2Val);
1773   auto Fabs = B.buildFAbs(Ty, Src);
1774 
1775   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1776   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1777   MI.eraseFromParent();
1778   return true;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeFceil(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784 
1785   const LLT S1 = LLT::scalar(1);
1786   const LLT S64 = LLT::scalar(64);
1787 
1788   Register Src = MI.getOperand(1).getReg();
1789   assert(MRI.getType(Src) == S64);
1790 
1791   // result = trunc(src)
1792   // if (src > 0.0 && src != result)
1793   //   result += 1.0
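  //
  // e.g. ceil(1.25): trunc = 1.0, 1.25 > 0.0 and 1.25 != 1.0, so the result is
  // 2.0. For ceil(-1.25): trunc = -1.0 and the condition fails, leaving -1.0.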
1794 
1795   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1796 
1797   const auto Zero = B.buildFConstant(S64, 0.0);
1798   const auto One = B.buildFConstant(S64, 1.0);
1799   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1800   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1801   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1802   auto Add = B.buildSelect(S64, And, One, Zero);
1803 
1804   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1807 }
1808 
1809 static MachineInstrBuilder extractF64Exponent(Register Hi,
1810                                               MachineIRBuilder &B) {
1811   const unsigned FractBits = 52;
1812   const unsigned ExpBits = 11;
1813   LLT S32 = LLT::scalar(32);
1814 
1815   auto Const0 = B.buildConstant(S32, FractBits - 32);
1816   auto Const1 = B.buildConstant(S32, ExpBits);
1817 
1818   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1819     .addUse(Hi)
1820     .addUse(Const0.getReg(0))
1821     .addUse(Const1.getReg(0));
1822 
1823   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1824 }
1825 
1826 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1827   MachineInstr &MI, MachineRegisterInfo &MRI,
1828   MachineIRBuilder &B) const {
1829   const LLT S1 = LLT::scalar(1);
1830   const LLT S32 = LLT::scalar(32);
1831   const LLT S64 = LLT::scalar(64);
1832 
1833   Register Src = MI.getOperand(1).getReg();
1834   assert(MRI.getType(Src) == S64);
1835 
1836   // TODO: Should this use extract since the low half is unused?
1837   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1838   Register Hi = Unmerge.getReg(1);
1839 
1840   // Extract the upper half, since this is where we will find the sign and
1841   // exponent.
1842   auto Exp = extractF64Exponent(Hi, B);
1843 
1844   const unsigned FractBits = 52;
1845 
1846   // Extract the sign bit.
1847   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1848   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1849 
1850   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
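  // For exponents in [0, 51], (FractMask >> Exp) covers exactly the fractional
  // bits of Src, so clearing them (below) truncates toward zero. Negative
  // exponents produce +/-0 (just the sign bit) and exponents above 51 mean Src
  // is already integral; the final selects handle both cases.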
1851 
1852   const auto Zero32 = B.buildConstant(S32, 0);
1853 
1854   // Extend back to 64-bits.
1855   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1856 
1857   auto Shr = B.buildAShr(S64, FractMask, Exp);
1858   auto Not = B.buildNot(S64, Shr);
1859   auto Tmp0 = B.buildAnd(S64, Src, Not);
1860   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1861 
1862   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1863   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1864 
1865   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1866   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1867   MI.eraseFromParent();
1868   return true;
1869 }
1870 
1871 bool AMDGPULegalizerInfo::legalizeITOFP(
1872   MachineInstr &MI, MachineRegisterInfo &MRI,
1873   MachineIRBuilder &B, bool Signed) const {
1874 
1875   Register Dst = MI.getOperand(0).getReg();
1876   Register Src = MI.getOperand(1).getReg();
1877 
1878   const LLT S64 = LLT::scalar(64);
1879   const LLT S32 = LLT::scalar(32);
1880 
1881   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1882 
1883   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
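  // Convert the two 32-bit halves separately and recombine:
  //   result = convert(hi) * 2^32 + convert_unsigned(lo)
  // Only the high half carries the sign; the low half is always unsigned.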
1884 
1885   auto CvtHi = Signed ?
1886     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1887     B.buildUITOFP(S64, Unmerge.getReg(1));
1888 
1889   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1890 
1891   auto ThirtyTwo = B.buildConstant(S32, 32);
1892   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1893     .addUse(CvtHi.getReg(0))
1894     .addUse(ThirtyTwo.getReg(0));
1895 
1896   // TODO: Should this propagate fast-math-flags?
1897   B.buildFAdd(Dst, LdExp, CvtLo);
1898   MI.eraseFromParent();
1899   return true;
1900 }
1901 
1902 // TODO: Copied from DAG implementation. Verify logic and document how this
1903 // actually works.
1904 bool AMDGPULegalizerInfo::legalizeFPTOI(
1905   MachineInstr &MI, MachineRegisterInfo &MRI,
1906   MachineIRBuilder &B, bool Signed) const {
1907 
1908   Register Dst = MI.getOperand(0).getReg();
1909   Register Src = MI.getOperand(1).getReg();
1910 
1911   const LLT S64 = LLT::scalar(64);
1912   const LLT S32 = LLT::scalar(32);
1913 
1914   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1915 
1916   unsigned Flags = MI.getFlags();
1917 
1918   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1919   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1920   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
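  // K0 is 2^-32 and K1 is -2^32. The high half is fptoint(floor(Trunc * 2^-32));
  // the fma computes Trunc - floor(Trunc * 2^-32) * 2^32, i.e. the low 32 bits,
  // which are always non-negative and converted as unsigned.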
1921 
1922   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1923   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1924   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1925 
1926   auto Hi = Signed ?
1927     B.buildFPTOSI(S32, FloorMul) :
1928     B.buildFPTOUI(S32, FloorMul);
1929   auto Lo = B.buildFPTOUI(S32, Fma);
1930 
1931   B.buildMerge(Dst, { Lo, Hi });
1932   MI.eraseFromParent();
1933 
1934   return true;
1935 }
1936 
1937 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1938                                                MachineInstr &MI) const {
1939   MachineFunction &MF = Helper.MIRBuilder.getMF();
1940   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1941 
1942   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1943                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1944 
1945   // With ieee_mode disabled, the instructions have the correct behavior
1946   // already for G_FMINNUM/G_FMAXNUM
1947   if (!MFI->getMode().IEEE)
1948     return !IsIEEEOp;
1949 
1950   if (IsIEEEOp)
1951     return true;
1952 
1953   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1954 }
1955 
1956 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1957   MachineInstr &MI, MachineRegisterInfo &MRI,
1958   MachineIRBuilder &B) const {
1959   // TODO: Should move some of this into LegalizerHelper.
1960 
1961   // TODO: Promote dynamic indexing of s16 to s32
1962 
1963   // FIXME: Artifact combiner probably should have replaced the truncated
1964   // constant before this, so we shouldn't need
1965   // getConstantVRegValWithLookThrough.
1966   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1967     MI.getOperand(2).getReg(), MRI);
1968   if (!IdxVal) // Dynamic case will be selected to register indexing.
1969     return true;
1970 
1971   Register Dst = MI.getOperand(0).getReg();
1972   Register Vec = MI.getOperand(1).getReg();
1973 
1974   LLT VecTy = MRI.getType(Vec);
1975   LLT EltTy = VecTy.getElementType();
1976   assert(EltTy == MRI.getType(Dst));
1977 
1978   if (IdxVal->Value < VecTy.getNumElements())
1979     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1980   else
1981     B.buildUndef(Dst);
1982 
1983   MI.eraseFromParent();
1984   return true;
1985 }
1986 
1987 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1988   MachineInstr &MI, MachineRegisterInfo &MRI,
1989   MachineIRBuilder &B) const {
1990   // TODO: Should move some of this into LegalizerHelper.
1991 
1992   // TODO: Promote dynamic indexing of s16 to s32
1993 
1994   // FIXME: Artifact combiner probably should have replaced the truncated
1995   // constant before this, so we shouldn't need
1996   // getConstantVRegValWithLookThrough.
1997   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1998     MI.getOperand(3).getReg(), MRI);
1999   if (!IdxVal) // Dynamic case will be selected to register indexing.
2000     return true;
2001 
2002   Register Dst = MI.getOperand(0).getReg();
2003   Register Vec = MI.getOperand(1).getReg();
2004   Register Ins = MI.getOperand(2).getReg();
2005 
2006   LLT VecTy = MRI.getType(Vec);
2007   LLT EltTy = VecTy.getElementType();
2008   assert(EltTy == MRI.getType(Ins));
2009 
2010   if (IdxVal->Value < VecTy.getNumElements())
2011     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2012   else
2013     B.buildUndef(Dst);
2014 
2015   MI.eraseFromParent();
2016   return true;
2017 }
2018 
2019 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2020   MachineInstr &MI, MachineRegisterInfo &MRI,
2021   MachineIRBuilder &B) const {
2022   const LLT V2S16 = LLT::vector(2, 16);
2023 
2024   Register Dst = MI.getOperand(0).getReg();
2025   Register Src0 = MI.getOperand(1).getReg();
2026   LLT DstTy = MRI.getType(Dst);
2027   LLT SrcTy = MRI.getType(Src0);
2028 
2029   if (SrcTy == V2S16 && DstTy == V2S16 &&
2030       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2031     return true;
2032 
2033   MachineIRBuilder HelperBuilder(MI);
2034   GISelObserverWrapper DummyObserver;
2035   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2036   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2037 }
2038 
2039 bool AMDGPULegalizerInfo::legalizeSinCos(
2040   MachineInstr &MI, MachineRegisterInfo &MRI,
2041   MachineIRBuilder &B) const {
2042 
2043   Register DstReg = MI.getOperand(0).getReg();
2044   Register SrcReg = MI.getOperand(1).getReg();
2045   LLT Ty = MRI.getType(DstReg);
2046   unsigned Flags = MI.getFlags();
2047 
2048   Register TrigVal;
2049   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2050   if (ST.hasTrigReducedRange()) {
2051     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2052     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2053       .addUse(MulVal.getReg(0))
2054       .setMIFlags(Flags).getReg(0);
2055   } else
2056     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2057 
2058   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2059     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2060   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2061     .addUse(TrigVal)
2062     .setMIFlags(Flags);
2063   MI.eraseFromParent();
2064   return true;
2065 }
2066 
2067 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2068                                                   MachineIRBuilder &B,
2069                                                   const GlobalValue *GV,
2070                                                   int64_t Offset,
2071                                                   unsigned GAFlags) const {
2072   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2073   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2074   // to the following code sequence:
2075   //
2076   // For constant address space:
2077   //   s_getpc_b64 s[0:1]
2078   //   s_add_u32 s0, s0, $symbol
2079   //   s_addc_u32 s1, s1, 0
2080   //
2081   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2082   //   a fixup or relocation is emitted to replace $symbol with a literal
2083   //   constant, which is a pc-relative offset from the encoding of the $symbol
2084   //   operand to the global variable.
2085   //
2086   // For global address space:
2087   //   s_getpc_b64 s[0:1]
2088   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2089   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2090   //
2091   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2092   //   fixups or relocations are emitted to replace $symbol@*@lo and
2093   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2094   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2095   //   operand to the global variable.
2096   //
2097   // What we want here is an offset from the value returned by s_getpc
2098   // (which is the address of the s_add_u32 instruction) to the global
2099   // variable, but since the encoding of $symbol starts 4 bytes after the start
2100   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2101   // small. This requires us to add 4 to the global variable offset in order to
2102   // compute the correct address.
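  //
  // Concretely, the fixup value is (GV + Offset + 4) - (PC + 4) = GV + Offset
  // - PC, where PC is the value returned by s_getpc_b64, so adding it to that
  // value yields GV + Offset.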
2103 
2104   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2105 
2106   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2107     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2108 
2109   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2110     .addDef(PCReg);
2111 
2112   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2113   if (GAFlags == SIInstrInfo::MO_NONE)
2114     MIB.addImm(0);
2115   else
2116     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2117 
2118   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2119 
2120   if (PtrTy.getSizeInBits() == 32)
2121     B.buildExtract(DstReg, PCReg, 0);
2122   return true;
2123  }
2124 
2125 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2126   MachineInstr &MI, MachineRegisterInfo &MRI,
2127   MachineIRBuilder &B) const {
2128   Register DstReg = MI.getOperand(0).getReg();
2129   LLT Ty = MRI.getType(DstReg);
2130   unsigned AS = Ty.getAddressSpace();
2131 
2132   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2133   MachineFunction &MF = B.getMF();
2134   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2135 
2136   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2137     if (!MFI->isEntryFunction()) {
2138       const Function &Fn = MF.getFunction();
2139       DiagnosticInfoUnsupported BadLDSDecl(
2140         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2141         DS_Warning);
2142       Fn.getContext().diagnose(BadLDSDecl);
2143 
2144       // We currently don't have a way to correctly allocate LDS objects that
2145       // aren't directly associated with a kernel. We do force inlining of
2146       // functions that use local objects. However, if these dead functions are
2147       // not eliminated, we don't want a compile time error. Just emit a warning
2148       // and a trap, since there should be no callable path here.
2149       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2150       B.buildUndef(DstReg);
2151       MI.eraseFromParent();
2152       return true;
2153     }
2154 
2155     // TODO: We could emit code to handle the initialization somewhere.
2156     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2157       const SITargetLowering *TLI = ST.getTargetLowering();
2158       if (!TLI->shouldUseLDSConstAddress(GV)) {
2159         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2160         return true; // Leave in place;
2161       }
2162 
2163       B.buildConstant(
2164           DstReg,
2165           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2166       MI.eraseFromParent();
2167       return true;
2168     }
2169 
2170     const Function &Fn = MF.getFunction();
2171     DiagnosticInfoUnsupported BadInit(
2172       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2173     Fn.getContext().diagnose(BadInit);
2174     return true;
2175   }
2176 
2177   const SITargetLowering *TLI = ST.getTargetLowering();
2178 
2179   if (TLI->shouldEmitFixup(GV)) {
2180     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2181     MI.eraseFromParent();
2182     return true;
2183   }
2184 
2185   if (TLI->shouldEmitPCReloc(GV)) {
2186     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2187     MI.eraseFromParent();
2188     return true;
2189   }
2190 
2191   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2192   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2193 
2194   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2195       MachinePointerInfo::getGOT(MF),
2196       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2197           MachineMemOperand::MOInvariant,
2198       8 /*Size*/, Align(8));
2199 
2200   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2201 
2202   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2204     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2205     B.buildExtract(DstReg, Load, 0);
2206   } else
2207     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2208 
2209   MI.eraseFromParent();
2210   return true;
2211 }
2212 
2213 bool AMDGPULegalizerInfo::legalizeLoad(
2214   MachineInstr &MI, MachineRegisterInfo &MRI,
2215   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2216   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2217   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2218   Observer.changingInstr(MI);
2219   MI.getOperand(1).setReg(Cast.getReg(0));
2220   Observer.changedInstr(MI);
2221   return true;
2222 }
2223 
2224 bool AMDGPULegalizerInfo::legalizeFMad(
2225   MachineInstr &MI, MachineRegisterInfo &MRI,
2226   MachineIRBuilder &B) const {
2227   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2228   assert(Ty.isScalar());
2229 
2230   MachineFunction &MF = B.getMF();
2231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2232 
2233   // TODO: Always legal with future ftz flag.
2234   // FIXME: Do we need just output?
2235   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2236     return true;
2237   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2238     return true;
2239 
2240   MachineIRBuilder HelperBuilder(MI);
2241   GISelObserverWrapper DummyObserver;
2242   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2243   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2247   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2248   Register DstReg = MI.getOperand(0).getReg();
2249   Register PtrReg = MI.getOperand(1).getReg();
2250   Register CmpVal = MI.getOperand(2).getReg();
2251   Register NewVal = MI.getOperand(3).getReg();
2252 
2253   assert(SITargetLowering::isFlatGlobalAddrSpace(
2254            MRI.getType(PtrReg).getAddressSpace()) &&
2255          "this should not have been custom lowered");
2256 
2257   LLT ValTy = MRI.getType(CmpVal);
2258   LLT VecTy = LLT::vector(2, ValTy);
2259 
2260   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2261 
2262   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2263     .addDef(DstReg)
2264     .addUse(PtrReg)
2265     .addUse(PackedVal)
2266     .setMemRefs(MI.memoperands());
2267 
2268   MI.eraseFromParent();
2269   return true;
2270 }
2271 
2272 bool AMDGPULegalizerInfo::legalizeFlog(
2273   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2274   Register Dst = MI.getOperand(0).getReg();
2275   Register Src = MI.getOperand(1).getReg();
2276   LLT Ty = B.getMRI()->getType(Dst);
2277   unsigned Flags = MI.getFlags();
2278 
2279   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2280   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2281 
2282   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2283   MI.eraseFromParent();
2284   return true;
2285 }
2286 
2287 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2288                                        MachineIRBuilder &B) const {
2289   Register Dst = MI.getOperand(0).getReg();
2290   Register Src = MI.getOperand(1).getReg();
2291   unsigned Flags = MI.getFlags();
2292   LLT Ty = B.getMRI()->getType(Dst);
2293 
2294   auto K = B.buildFConstant(Ty, numbers::log2e);
2295   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2296   B.buildFExp2(Dst, Mul, Flags);
2297   MI.eraseFromParent();
2298   return true;
2299 }
2300 
2301 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2302                                        MachineIRBuilder &B) const {
2303   Register Dst = MI.getOperand(0).getReg();
2304   Register Src0 = MI.getOperand(1).getReg();
2305   Register Src1 = MI.getOperand(2).getReg();
2306   unsigned Flags = MI.getFlags();
2307   LLT Ty = B.getMRI()->getType(Dst);
2308   const LLT S16 = LLT::scalar(16);
2309   const LLT S32 = LLT::scalar(32);
2310 
2311   if (Ty == S32) {
2312     auto Log = B.buildFLog2(S32, Src0, Flags);
2313     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2314       .addUse(Log.getReg(0))
2315       .addUse(Src1)
2316       .setMIFlags(Flags);
2317     B.buildFExp2(Dst, Mul, Flags);
2318   } else if (Ty == S16) {
2319     // There's no f16 fmul_legacy, so we need to convert for it.
2320     auto Log = B.buildFLog2(S16, Src0, Flags);
2321     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2322     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2323     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2324       .addUse(Ext0.getReg(0))
2325       .addUse(Ext1.getReg(0))
2326       .setMIFlags(Flags);
2327 
2328     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2329   } else
2330     return false;
2331 
2332   MI.eraseFromParent();
2333   return true;
2334 }
2335 
2336 // Find a source register, ignoring any possible source modifiers.
2337 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2338   Register ModSrc = OrigSrc;
2339   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2340     ModSrc = SrcFNeg->getOperand(1).getReg();
2341     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2342       ModSrc = SrcFAbs->getOperand(1).getReg();
2343   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2344     ModSrc = SrcFAbs->getOperand(1).getReg();
2345   return ModSrc;
2346 }
2347 
2348 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2349                                          MachineRegisterInfo &MRI,
2350                                          MachineIRBuilder &B) const {
2351 
2352   const LLT S1 = LLT::scalar(1);
2353   const LLT S64 = LLT::scalar(64);
2354   Register Dst = MI.getOperand(0).getReg();
2355   Register OrigSrc = MI.getOperand(1).getReg();
2356   unsigned Flags = MI.getFlags();
2357   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2358          "this should not have been custom lowered");
2359 
2360   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2361   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2362   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2363   // V_FRACT bug is:
2364   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2365   //
2366   // Convert floor(x) to (x - fract(x))
2367 
2368   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2369     .addUse(OrigSrc)
2370     .setMIFlags(Flags);
2371 
2372   // Give source modifier matching some assistance before obscuring a foldable
2373   // pattern.
2374 
2375   // TODO: We can avoid the neg on the fract? The input sign to fract
2376   // shouldn't matter?
2377   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2378 
2379   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
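  // This is the largest double strictly less than 1.0, i.e. the
  // 0.99999999999999999 from the workaround description above.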
2380 
2381   Register Min = MRI.createGenericVirtualRegister(S64);
2382 
2383   // We don't need to concern ourselves with the snan handling difference, so
2384   // use the one which will directly select.
2385   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2386   if (MFI->getMode().IEEE)
2387     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2388   else
2389     B.buildFMinNum(Min, Fract, Const, Flags);
2390 
2391   Register CorrectedFract = Min;
2392   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2393     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2394     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2395   }
2396 
2397   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2398   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2399 
2400   MI.eraseFromParent();
2401   return true;
2402 }
2403 
2404 // Turn an illegal packed v2s16 build vector into bit operations.
2405 // TODO: This should probably be a bitcast action in LegalizerHelper.
2406 bool AMDGPULegalizerInfo::legalizeBuildVector(
2407   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2408   Register Dst = MI.getOperand(0).getReg();
2409   const LLT S32 = LLT::scalar(32);
2410   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2411 
2412   Register Src0 = MI.getOperand(1).getReg();
2413   Register Src1 = MI.getOperand(2).getReg();
2414   assert(MRI.getType(Src0) == LLT::scalar(16));
2415 
2416   auto Merge = B.buildMerge(S32, {Src0, Src1});
2417   B.buildBitcast(Dst, Merge);
2418 
2419   MI.eraseFromParent();
2420   return true;
2421 }
2422 
// Return the use branch instruction, or null if the usage is invalid.
2424 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2425                                        MachineRegisterInfo &MRI,
2426                                        MachineInstr *&Br,
2427                                        MachineBasicBlock *&UncondBrTarget) {
2428   Register CondDef = MI.getOperand(0).getReg();
2429   if (!MRI.hasOneNonDBGUse(CondDef))
2430     return nullptr;
2431 
2432   MachineBasicBlock *Parent = MI.getParent();
2433   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2434   if (UseMI.getParent() != Parent ||
2435       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2436     return nullptr;
2437 
2438   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2439   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2440   if (Next == Parent->end()) {
2441     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2442     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2443       return nullptr;
2444     UncondBrTarget = &*NextMBB;
2445   } else {
2446     if (Next->getOpcode() != AMDGPU::G_BR)
2447       return nullptr;
2448     Br = &*Next;
2449     UncondBrTarget = Br->getOperand(0).getMBB();
2450   }
2451 
2452   return &UseMI;
2453 }
2454 
2455 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2456                                                MachineRegisterInfo &MRI,
2457                                                Register LiveIn,
2458                                                Register PhyReg) const {
2459   assert(PhyReg.isPhysical() && "Physical register expected");
2460 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2463   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2464   if (!MRI.getVRegDef(LiveIn)) {
2465     // FIXME: Should have scoped insert pt
2466     MachineBasicBlock &OrigInsBB = B.getMBB();
2467     auto OrigInsPt = B.getInsertPt();
2468 
2469     MachineBasicBlock &EntryMBB = B.getMF().front();
2470     EntryMBB.addLiveIn(PhyReg);
2471     B.setInsertPt(EntryMBB, EntryMBB.begin());
2472     B.buildCopy(LiveIn, PhyReg);
2473 
2474     B.setInsertPt(OrigInsBB, OrigInsPt);
2475   }
2476 
2477   return LiveIn;
2478 }
2479 
2480 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2481                                                 MachineRegisterInfo &MRI,
2482                                                 Register PhyReg, LLT Ty,
2483                                                 bool InsertLiveInCopy) const {
2484   assert(PhyReg.isPhysical() && "Physical register expected");
2485 
  // Get or create the virtual live-in register
2487   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2488   if (!LiveIn) {
2489     LiveIn = MRI.createGenericVirtualRegister(Ty);
2490     MRI.addLiveIn(PhyReg, LiveIn);
2491   }
2492 
  // When the only copy actually required is from a virtual register to the
  // physical register (to be inserted later), the live-in copy from the
  // physical register to a virtual register is not needed.
2496   if (!InsertLiveInCopy)
2497     return LiveIn;
2498 
2499   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2500 }
2501 
2502 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2503                                          const ArgDescriptor *Arg,
2504                                          const TargetRegisterClass *ArgRC,
2505                                          LLT ArgTy) const {
2506   MCRegister SrcReg = Arg->getRegister();
2507   assert(SrcReg.isPhysical() && "Physical register expected");
2508   assert(DstReg.isVirtual() && "Virtual register expected");
2509 
2510   MachineRegisterInfo &MRI = *B.getMRI();
2511   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);
2512 
2513   if (Arg->isMasked()) {
2514     // TODO: Should we try to emit this once in the entry block?
2515     const LLT S32 = LLT::scalar(32);
2516     const unsigned Mask = Arg->getMask();
2517     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
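    // The argument occupies a bit-field of the input register: shift the field
    // down to bit 0, then mask off everything above it.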
2518 
2519     Register AndMaskSrc = LiveIn;
2520 
2521     if (Shift != 0) {
2522       auto ShiftAmt = B.buildConstant(S32, Shift);
2523       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2524     }
2525 
2526     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2527   } else {
2528     B.buildCopy(DstReg, LiveIn);
2529   }
2530 
2531   return true;
2532 }
2533 
2534 bool AMDGPULegalizerInfo::loadInputValue(
2535     Register DstReg, MachineIRBuilder &B,
2536     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2537   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2538   const ArgDescriptor *Arg;
2539   const TargetRegisterClass *ArgRC;
2540   LLT ArgTy;
2541   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2542 
2543   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2544     return false; // TODO: Handle these
2545   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2546 }
2547 
2548 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2549     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2550     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2551   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2552     return false;
2553 
2554   MI.eraseFromParent();
2555   return true;
2556 }
2557 
2558 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2559                                        MachineRegisterInfo &MRI,
2560                                        MachineIRBuilder &B) const {
2561   Register Dst = MI.getOperand(0).getReg();
2562   LLT DstTy = MRI.getType(Dst);
2563   LLT S16 = LLT::scalar(16);
2564   LLT S32 = LLT::scalar(32);
2565   LLT S64 = LLT::scalar(64);
2566 
2567   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2568     return true;
2569 
2570   if (DstTy == S16)
2571     return legalizeFDIV16(MI, MRI, B);
2572   if (DstTy == S32)
2573     return legalizeFDIV32(MI, MRI, B);
2574   if (DstTy == S64)
2575     return legalizeFDIV64(MI, MRI, B);
2576 
2577   return false;
2578 }
2579 
2580 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2581                                                   Register DstReg,
2582                                                   Register X,
2583                                                   Register Y,
2584                                                   bool IsDiv) const {
2585   const LLT S1 = LLT::scalar(1);
2586   const LLT S32 = LLT::scalar(32);
2587 
2588   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2589   // algorithm used here.
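  //
  // In short: Z approximates 2^32 / Y (V_RCP_IFLAG_F32 refined by one
  // Newton-Raphson style step), Q = umulh(X, Z) slightly undershoots the true
  // quotient, and the two refinement steps below correct the quotient and
  // remainder.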
2590 
2591   // Initial estimate of inv(y).
2592   auto FloatY = B.buildUITOFP(S32, Y);
2593   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2594   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2595   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2596   auto Z = B.buildFPTOUI(S32, ScaledY);
2597 
2598   // One round of UNR.
2599   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2600   auto NegYZ = B.buildMul(S32, NegY, Z);
2601   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2602 
2603   // Quotient/remainder estimate.
2604   auto Q = B.buildUMulH(S32, X, Z);
2605   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2606 
2607   // First quotient/remainder refinement.
2608   auto One = B.buildConstant(S32, 1);
2609   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2610   if (IsDiv)
2611     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2612   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2613 
2614   // Second quotient/remainder refinement.
2615   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2616   if (IsDiv)
2617     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2618   else
2619     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2620 }
2621 
2622 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2623                                               MachineRegisterInfo &MRI,
2624                                               MachineIRBuilder &B) const {
2625   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2626   Register DstReg = MI.getOperand(0).getReg();
2627   Register Num = MI.getOperand(1).getReg();
2628   Register Den = MI.getOperand(2).getReg();
2629   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2630   MI.eraseFromParent();
2631   return true;
2632 }
2633 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2635 //
2636 // Return lo, hi of result
2637 //
2638 // %cvt.lo = G_UITOFP Val.lo
2639 // %cvt.hi = G_UITOFP Val.hi
2640 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2641 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2642 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2643 // %mul2 = G_FMUL %mul1, 2**(-32)
2644 // %trunc = G_INTRINSIC_TRUNC %mul2
2645 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2646 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2647 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2648                                                        Register Val) {
2649   const LLT S32 = LLT::scalar(32);
2650   auto Unmerge = B.buildUnmerge(S32, Val);
2651 
2652   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2653   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2654 
2655   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2656                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2657 
2658   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2659   auto Mul1 =
2660       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2661 
2662   // 2**(-32)
2663   auto Mul2 =
2664       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2665   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2666 
2667   // -(2**32)
2668   auto Mad2 = B.buildFMAD(S32, Trunc,
2669                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2670 
2671   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2672   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2673 
2674   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2675 }
2676 
2677 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2678                                                   Register DstReg,
2679                                                   Register Numer,
2680                                                   Register Denom,
2681                                                   bool IsDiv) const {
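  // Same approach as the 32-bit path, widened to 64 bits: start from a
  // floating-point based estimate of 2^64 / Denom (emitReciprocalU64), refine
  // it with two multiply/add rounds, then conditionally correct the quotient
  // and remainder at most twice (the C3 and C6 conditions below).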
2682   const LLT S32 = LLT::scalar(32);
2683   const LLT S64 = LLT::scalar(64);
2684   const LLT S1 = LLT::scalar(1);
2685   Register RcpLo, RcpHi;
2686 
2687   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2688 
2689   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2690 
2691   auto Zero64 = B.buildConstant(S64, 0);
2692   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2693 
2694   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2695   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2696 
2697   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2698   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2699   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2700 
2701   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2702   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2703   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2704   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2705 
2706   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2707   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2708   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2709   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2710   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2711 
2712   auto Zero32 = B.buildConstant(S32, 0);
2713   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2714   auto Add2_HiC =
2715       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2716   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2717   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2718 
2719   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2720   Register NumerLo = UnmergeNumer.getReg(0);
2721   Register NumerHi = UnmergeNumer.getReg(1);
2722 
2723   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2724   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2725   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2726   Register Mul3_Lo = UnmergeMul3.getReg(0);
2727   Register Mul3_Hi = UnmergeMul3.getReg(1);
2728   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2729   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2730   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2731   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2732 
2733   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2734   Register DenomLo = UnmergeDenom.getReg(0);
2735   Register DenomHi = UnmergeDenom.getReg(1);
2736 
2737   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2738   auto C1 = B.buildSExt(S32, CmpHi);
2739 
2740   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2741   auto C2 = B.buildSExt(S32, CmpLo);
2742 
2743   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2744   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2745 
2746   // TODO: Here and below portions of the code can be enclosed into if/endif.
2747   // Currently control flow is unconditional and we have 4 selects after
2748   // potential endif to substitute PHIs.
2749 
2750   // if C3 != 0 ...
2751   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2752   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2753   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2754   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2755 
2756   auto One64 = B.buildConstant(S64, 1);
2757   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2758 
2759   auto C4 =
2760       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2761   auto C5 =
2762       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2763   auto C6 = B.buildSelect(
2764       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2765 
2766   // if (C6 != 0)
2767   auto Add4 = B.buildAdd(S64, Add3, One64);
2768   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2769 
2770   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2771   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2772   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2773 
2774   // endif C6
2775   // endif C3
2776 
2777   if (IsDiv) {
2778     auto Sel1 = B.buildSelect(
2779         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2780     B.buildSelect(DstReg,
2781                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2782   } else {
2783     auto Sel2 = B.buildSelect(
2784         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2785     B.buildSelect(DstReg,
2786                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2787   }
2788 }
2789 
2790 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2791                                             MachineRegisterInfo &MRI,
2792                                             MachineIRBuilder &B) const {
2793   const LLT S64 = LLT::scalar(64);
2794   const LLT S32 = LLT::scalar(32);
2795   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2796   Register DstReg = MI.getOperand(0).getReg();
2797   Register Num = MI.getOperand(1).getReg();
2798   Register Den = MI.getOperand(2).getReg();
2799   LLT Ty = MRI.getType(DstReg);
2800 
2801   if (Ty == S32)
2802     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2803   else if (Ty == S64)
2804     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2805   else
2806     return false;
2807 
2808   MI.eraseFromParent();
  return true;
}
2812 
2813 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2814                                             MachineRegisterInfo &MRI,
2815                                             MachineIRBuilder &B) const {
2816   const LLT S64 = LLT::scalar(64);
2817   const LLT S32 = LLT::scalar(32);
2818 
2819   Register DstReg = MI.getOperand(0).getReg();
2820   const LLT Ty = MRI.getType(DstReg);
2821   if (Ty != S32 && Ty != S64)
2822     return false;
2823 
2824   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2825 
2826   Register LHS = MI.getOperand(1).getReg();
2827   Register RHS = MI.getOperand(2).getReg();
2828 
2829   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2830   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2831   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
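  // With sign = x >> (bitwidth - 1) being 0 or -1, abs(x) = (x + sign) ^ sign;
  // the result's sign is restored at the end with (r ^ sign) - sign.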
2832 
2833   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2834   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2835 
2836   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2837   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2838 
2839   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2840   if (Ty == S32)
2841     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2842   else
2843     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2844 
2845   Register Sign;
2846   if (IsDiv)
2847     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2848   else
2849     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2850 
2851   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2852   B.buildSub(DstReg, UDivRem, Sign);
2853 
2854   MI.eraseFromParent();
2855   return true;
2856 }
2857 
2858 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2859                                                  MachineRegisterInfo &MRI,
2860                                                  MachineIRBuilder &B) const {
2861   Register Res = MI.getOperand(0).getReg();
2862   Register LHS = MI.getOperand(1).getReg();
2863   Register RHS = MI.getOperand(2).getReg();
2864 
2865   uint16_t Flags = MI.getFlags();
2866 
2867   LLT ResTy = MRI.getType(Res);
2868   LLT S32 = LLT::scalar(32);
2869   LLT S64 = LLT::scalar(64);
2870 
2871   const MachineFunction &MF = B.getMF();
2872   bool Unsafe =
2873     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2874 
2875   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2876     return false;
2877 
2878   if (!Unsafe && ResTy == S32 &&
2879       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2880     return false;
2881 
2882   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2883     // 1 / x -> RCP(x)
2884     if (CLHS->isExactlyValue(1.0)) {
2885       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2886         .addUse(RHS)
2887         .setMIFlags(Flags);
2888 
2889       MI.eraseFromParent();
2890       return true;
2891     }
2892 
2893     // -1 / x -> RCP( FNEG(x) )
2894     if (CLHS->isExactlyValue(-1.0)) {
2895       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2896       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2897         .addUse(FNeg.getReg(0))
2898         .setMIFlags(Flags);
2899 
2900       MI.eraseFromParent();
2901       return true;
2902     }
2903   }
2904 
2905   // x / y -> x * (1.0 / y)
2906   if (Unsafe) {
2907     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2908       .addUse(RHS)
2909       .setMIFlags(Flags);
2910     B.buildFMul(Res, LHS, RCP, Flags);
2911 
2912     MI.eraseFromParent();
2913     return true;
2914   }
2915 
2916   return false;
2917 }
2918 
2919 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2920                                          MachineRegisterInfo &MRI,
2921                                          MachineIRBuilder &B) const {
2922   Register Res = MI.getOperand(0).getReg();
2923   Register LHS = MI.getOperand(1).getReg();
2924   Register RHS = MI.getOperand(2).getReg();
2925 
2926   uint16_t Flags = MI.getFlags();
2927 
2928   LLT S16 = LLT::scalar(16);
2929   LLT S32 = LLT::scalar(32);
2930 
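  // Expand f16 division by promoting to f32: multiply LHS by the f32
  // reciprocal of RHS, truncate the quotient back to f16, and let
  // v_div_fixup_f16 patch up the special cases.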
2931   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2932   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2933 
2934   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2935     .addUse(RHSExt.getReg(0))
2936     .setMIFlags(Flags);
2937 
2938   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2939   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2940 
2941   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2942     .addUse(RDst.getReg(0))
2943     .addUse(RHS)
2944     .addUse(LHS)
2945     .setMIFlags(Flags);
2946 
2947   MI.eraseFromParent();
2948   return true;
2949 }
2950 
2951 // Enable or disable FP32 denorm mode. When 'Enable' is true, denormals are
2952 // enabled; when false, the mode is restored to the function's default value.
2953 static void toggleSPDenormMode(bool Enable,
2954                                MachineIRBuilder &B,
2955                                const GCNSubtarget &ST,
2956                                AMDGPU::SIModeRegisterDefaults Mode) {
2957   // Set SP denorm mode to this value.
2958   unsigned SPDenormMode =
2959     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2960 
2961   if (ST.hasDenormModeInst()) {
2962     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2963     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2964 
2965     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2966     B.buildInstr(AMDGPU::S_DENORM_MODE)
2967       .addImm(NewDenormModeValue);
2968 
2969   } else {
2970     // Select FP32 bit field in mode register.
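    // The immediate encodes hwreg(HW_REG_MODE, 4, 2), i.e. bits [5:4] of the
    // MODE register, which hold the FP32 denorm controls.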
2971     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2972                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2973                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2974 
2975     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2976       .addImm(SPDenormMode)
2977       .addImm(SPDenormModeBitField);
2978   }
2979 }
2980 
2981 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2982                                          MachineRegisterInfo &MRI,
2983                                          MachineIRBuilder &B) const {
2984   Register Res = MI.getOperand(0).getReg();
2985   Register LHS = MI.getOperand(1).getReg();
2986   Register RHS = MI.getOperand(2).getReg();
2987   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2988   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2989 
2990   uint16_t Flags = MI.getFlags();
2991 
2992   LLT S32 = LLT::scalar(32);
2993   LLT S1 = LLT::scalar(1);
2994 
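  // Expand using the hardware division sequence: scale the operands with
  // div_scale, refine the rcp estimate with FMA-based Newton-Raphson steps,
  // then combine the terms with div_fmas and handle special cases with
  // div_fixup.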
2995   auto One = B.buildFConstant(S32, 1.0f);
2996 
2997   auto DenominatorScaled =
2998     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2999       .addUse(LHS)
3000       .addUse(RHS)
3001       .addImm(0)
3002       .setMIFlags(Flags);
3003   auto NumeratorScaled =
3004     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3005       .addUse(LHS)
3006       .addUse(RHS)
3007       .addImm(1)
3008       .setMIFlags(Flags);
3009 
3010   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3011     .addUse(DenominatorScaled.getReg(0))
3012     .setMIFlags(Flags);
3013   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3014 
3015   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3016   // aren't modeled as reading it.
3017   if (!Mode.allFP32Denormals())
3018     toggleSPDenormMode(true, B, ST, Mode);
3019 
3020   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3021   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3022   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3023   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3024   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3025   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3026 
3027   if (!Mode.allFP32Denormals())
3028     toggleSPDenormMode(false, B, ST, Mode);
3029 
3030   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3031     .addUse(Fma4.getReg(0))
3032     .addUse(Fma1.getReg(0))
3033     .addUse(Fma3.getReg(0))
3034     .addUse(NumeratorScaled.getReg(1))
3035     .setMIFlags(Flags);
3036 
3037   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3038     .addUse(Fmas.getReg(0))
3039     .addUse(RHS)
3040     .addUse(LHS)
3041     .setMIFlags(Flags);
3042 
3043   MI.eraseFromParent();
3044   return true;
3045 }
3046 
3047 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3048                                          MachineRegisterInfo &MRI,
3049                                          MachineIRBuilder &B) const {
3050   Register Res = MI.getOperand(0).getReg();
3051   Register LHS = MI.getOperand(1).getReg();
3052   Register RHS = MI.getOperand(2).getReg();
3053 
3054   uint16_t Flags = MI.getFlags();
3055 
3056   LLT S64 = LLT::scalar(64);
3057   LLT S1 = LLT::scalar(1);
3058 
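  // The f64 expansion mirrors the f32 one: scale both operands with div_scale,
  // refine the rcp estimate with a chain of FMAs, then finish with div_fmas
  // and div_fixup.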
3059   auto One = B.buildFConstant(S64, 1.0);
3060 
3061   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3062     .addUse(LHS)
3063     .addUse(RHS)
3064     .addImm(0)
3065     .setMIFlags(Flags);
3066 
3067   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3068 
3069   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3070     .addUse(DivScale0.getReg(0))
3071     .setMIFlags(Flags);
3072 
3073   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3074   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3075   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3076 
3077   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3078     .addUse(LHS)
3079     .addUse(RHS)
3080     .addImm(1)
3081     .setMIFlags(Flags);
3082 
3083   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3084   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3085   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3086 
3087   Register Scale;
3088   if (!ST.hasUsableDivScaleConditionOutput()) {
3089     // Workaround a hardware bug on SI where the condition output from div_scale
3090     // is not usable.
3091 
3092     LLT S32 = LLT::scalar(32);
3093 
3094     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3095     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3096     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3097     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3098 
3099     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3100                               Scale1Unmerge.getReg(1));
3101     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3102                               Scale0Unmerge.getReg(1));
3103     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3104   } else {
3105     Scale = DivScale1.getReg(1);
3106   }
3107 
3108   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3109     .addUse(Fma4.getReg(0))
3110     .addUse(Fma3.getReg(0))
3111     .addUse(Mul.getReg(0))
3112     .addUse(Scale)
3113     .setMIFlags(Flags);
3114 
3115   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3116     .addUse(Fmas.getReg(0))
3117     .addUse(RHS)
3118     .addUse(LHS)
3119     .setMIFlags(Flags);
3120 
3121   MI.eraseFromParent();
3122   return true;
3123 }
3124 
3125 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3126                                                  MachineRegisterInfo &MRI,
3127                                                  MachineIRBuilder &B) const {
3128   Register Res = MI.getOperand(0).getReg();
3129   Register LHS = MI.getOperand(2).getReg();
3130   Register RHS = MI.getOperand(3).getReg();
3131   uint16_t Flags = MI.getFlags();
3132 
3133   LLT S32 = LLT::scalar(32);
3134   LLT S1 = LLT::scalar(1);
3135 
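  // Scale the denominator by 2^-32 (0x2f800000) when |RHS| exceeds 2^96
  // (0x6f800000), take the reciprocal of the scaled value, and multiply the
  // quotient by the same scale factor so the result is still LHS * (1 / RHS).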
3136   auto Abs = B.buildFAbs(S32, RHS, Flags);
3137   const APFloat C0Val(1.0f);
3138 
3139   auto C0 = B.buildConstant(S32, 0x6f800000);
3140   auto C1 = B.buildConstant(S32, 0x2f800000);
3141   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3142 
3143   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3144   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3145 
3146   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3147 
3148   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3149     .addUse(Mul0.getReg(0))
3150     .setMIFlags(Flags);
3151 
3152   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3153 
3154   B.buildFMul(Res, Sel, Mul1, Flags);
3155 
3156   MI.eraseFromParent();
3157   return true;
3158 }
3159 
3160 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3161                                             MachineRegisterInfo &MRI,
3162                                             MachineIRBuilder &B) const {
3163   uint64_t Offset =
3164     ST.getTargetLowering()->getImplicitParameterOffset(
3165       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3166   LLT DstTy = MRI.getType(DstReg);
3167   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3168 
3169   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3170   if (!loadInputValue(KernargPtrReg, B,
3171                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3172     return false;
3173 
3174   // FIXME: This should be nuw
3175   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3176   return true;
3177 }
3178 
3179 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3180                                                  MachineRegisterInfo &MRI,
3181                                                  MachineIRBuilder &B) const {
3182   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3183   if (!MFI->isEntryFunction()) {
3184     return legalizePreloadedArgIntrin(MI, MRI, B,
3185                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3186   }
3187 
3188   Register DstReg = MI.getOperand(0).getReg();
3189   if (!getImplicitArgPtr(DstReg, MRI, B))
3190     return false;
3191 
3192   MI.eraseFromParent();
3193   return true;
3194 }
3195 
3196 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3197                                               MachineRegisterInfo &MRI,
3198                                               MachineIRBuilder &B,
3199                                               unsigned AddrSpace) const {
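  // A flat pointer is in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base.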
3200   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3201   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3202   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3203   MI.eraseFromParent();
3204   return true;
3205 }
3206 
3207 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3208 // offset (the offset that is included in bounds checking and swizzling, to be
3209 // split between the instruction's voffset and immoffset fields) and soffset
3210 // (the offset that is excluded from bounds checking and swizzling, to go in
3211 // the instruction's soffset field).  This function takes the first kind of
3212 // offset and figures out how to split it between voffset and immoffset.
3213 std::tuple<Register, unsigned, unsigned>
3214 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3215                                         Register OrigOffset) const {
3216   const unsigned MaxImm = 4095;
3217   Register BaseReg;
3218   unsigned TotalConstOffset;
3219   MachineInstr *OffsetDef;
3220   const LLT S32 = LLT::scalar(32);
3221 
3222   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3223     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3224 
3225   unsigned ImmOffset = TotalConstOffset;
3226 
3227   // If the immediate value is too big for the immoffset field, keep only the
3228   // low 12 bits in the immoffset field so that the value that is copied/added
3229   // for the voffset field is a multiple of 4096, and it stands more chance
3230   // of being CSEd with the copy/add for another similar load/store.
3231   // However, do not round down to a multiple of 4096 if that would produce a
3232   // negative number, as it appears to be illegal to have a negative offset
3233   // in the vgpr, even if adding the immediate offset makes it positive.
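  // For example, a total constant offset of 8200 is split into an immoffset of
  // 8 and a voffset add of 8192.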
3234   unsigned Overflow = ImmOffset & ~MaxImm;
3235   ImmOffset -= Overflow;
3236   if ((int32_t)Overflow < 0) {
3237     Overflow += ImmOffset;
3238     ImmOffset = 0;
3239   }
3240 
3241   if (Overflow != 0) {
3242     if (!BaseReg) {
3243       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3244     } else {
3245       auto OverflowVal = B.buildConstant(S32, Overflow);
3246       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3247     }
3248   }
3249 
3250   if (!BaseReg)
3251     BaseReg = B.buildConstant(S32, 0).getReg(0);
3252 
3253   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3254 }
3255 
3256 /// Handle register layout difference for f16 images for some subtargets.
3257 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3258                                              MachineRegisterInfo &MRI,
3259                                              Register Reg) const {
3260   if (!ST.hasUnpackedD16VMem())
3261     return Reg;
3262 
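  // With unpacked D16 memory instructions each 16-bit element occupies the low
  // half of a separate 32-bit register, so any-extend every element to s32 and
  // rebuild the vector.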
3263   const LLT S16 = LLT::scalar(16);
3264   const LLT S32 = LLT::scalar(32);
3265   LLT StoreVT = MRI.getType(Reg);
3266   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3267 
3268   auto Unmerge = B.buildUnmerge(S16, Reg);
3269 
3270   SmallVector<Register, 4> WideRegs;
3271   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3272     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3273 
3274   int NumElts = StoreVT.getNumElements();
3275 
3276   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3277 }
3278 
3279 Register AMDGPULegalizerInfo::fixStoreSourceType(
3280   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3281   MachineRegisterInfo *MRI = B.getMRI();
3282   LLT Ty = MRI->getType(VData);
3283 
3284   const LLT S16 = LLT::scalar(16);
3285 
3286   // Fixup illegal register types for i8 and i16 stores.
3287   if (Ty == LLT::scalar(8) || Ty == S16) {
3288     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3289     return AnyExt;
3290   }
3291 
3292   if (Ty.isVector()) {
3293     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3294       if (IsFormat)
3295         return handleD16VData(B, *MRI, VData);
3296     }
3297   }
3298 
3299   return VData;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3303                                               MachineRegisterInfo &MRI,
3304                                               MachineIRBuilder &B,
3305                                               bool IsTyped,
3306                                               bool IsFormat) const {
3307   Register VData = MI.getOperand(1).getReg();
3308   LLT Ty = MRI.getType(VData);
3309   LLT EltTy = Ty.getScalarType();
3310   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3311   const LLT S32 = LLT::scalar(32);
3312 
3313   VData = fixStoreSourceType(B, VData, IsFormat);
3314   Register RSrc = MI.getOperand(2).getReg();
3315 
3316   MachineMemOperand *MMO = *MI.memoperands_begin();
3317   const int MemSize = MMO->getSize();
3318 
3319   unsigned ImmOffset;
3320   unsigned TotalOffset;
3321 
3322   // The typed intrinsics add an immediate after the registers.
3323   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3324 
3325   // The struct intrinsic variants add one additional operand over raw.
3326   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3327   Register VIndex;
3328   int OpOffset = 0;
3329   if (HasVIndex) {
3330     VIndex = MI.getOperand(3).getReg();
3331     OpOffset = 1;
3332   }
3333 
3334   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3335   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3336 
3337   unsigned Format = 0;
3338   if (IsTyped) {
3339     Format = MI.getOperand(5 + OpOffset).getImm();
3340     ++OpOffset;
3341   }
3342 
3343   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3344 
3345   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3346   if (TotalOffset != 0)
3347     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3348 
3349   unsigned Opc;
3350   if (IsTyped) {
3351     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3352                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3353   } else if (IsFormat) {
3354     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3355                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3356   } else {
3357     switch (MemSize) {
3358     case 1:
3359       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3360       break;
3361     case 2:
3362       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3363       break;
3364     default:
3365       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3366       break;
3367     }
3368   }
3369 
3370   if (!VIndex)
3371     VIndex = B.buildConstant(S32, 0).getReg(0);
3372 
3373   auto MIB = B.buildInstr(Opc)
3374     .addUse(VData)              // vdata
3375     .addUse(RSrc)               // rsrc
3376     .addUse(VIndex)             // vindex
3377     .addUse(VOffset)            // voffset
3378     .addUse(SOffset)            // soffset
3379     .addImm(ImmOffset);         // offset(imm)
3380 
3381   if (IsTyped)
3382     MIB.addImm(Format);
3383 
3384   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3385      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3386      .addMemOperand(MMO);
3387 
3388   MI.eraseFromParent();
3389   return true;
3390 }
3391 
3392 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3393                                              MachineRegisterInfo &MRI,
3394                                              MachineIRBuilder &B,
3395                                              bool IsFormat,
3396                                              bool IsTyped) const {
3397   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3398   MachineMemOperand *MMO = *MI.memoperands_begin();
3399   const int MemSize = MMO->getSize();
3400   const LLT S32 = LLT::scalar(32);
3401 
3402   Register Dst = MI.getOperand(0).getReg();
3403   Register RSrc = MI.getOperand(2).getReg();
3404 
3405   // The typed intrinsics add an immediate after the registers.
3406   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3407 
3408   // The struct intrinsic variants add one additional operand over raw.
3409   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3410   Register VIndex;
3411   int OpOffset = 0;
3412   if (HasVIndex) {
3413     VIndex = MI.getOperand(3).getReg();
3414     OpOffset = 1;
3415   }
3416 
3417   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3418   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3419 
3420   unsigned Format = 0;
3421   if (IsTyped) {
3422     Format = MI.getOperand(5 + OpOffset).getImm();
3423     ++OpOffset;
3424   }
3425 
3426   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3427   unsigned ImmOffset;
3428   unsigned TotalOffset;
3429 
3430   LLT Ty = MRI.getType(Dst);
3431   LLT EltTy = Ty.getScalarType();
3432   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3433   const bool Unpacked = ST.hasUnpackedD16VMem();
3434 
3435   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3436   if (TotalOffset != 0)
3437     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3438 
3439   unsigned Opc;
3440 
3441   if (IsTyped) {
3442     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3443                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3444   } else if (IsFormat) {
3445     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3446                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3447   } else {
3448     switch (MemSize) {
3449     case 1:
3450       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3451       break;
3452     case 2:
3453       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3454       break;
3455     default:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3457       break;
3458     }
3459   }
3460 
3461   Register LoadDstReg;
3462 
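  // Sub-dword scalar loads are widened to s32 and truncated after the load;
  // packed d16 vector results on unpacked subtargets are loaded into a wide
  // s32 vector and repacked below.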
3463   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3464   LLT UnpackedTy = Ty.changeElementSize(32);
3465 
3466   if (IsExtLoad)
3467     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3468   else if (Unpacked && IsD16 && Ty.isVector())
3469     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3470   else
3471     LoadDstReg = Dst;
3472 
3473   if (!VIndex)
3474     VIndex = B.buildConstant(S32, 0).getReg(0);
3475 
3476   auto MIB = B.buildInstr(Opc)
3477     .addDef(LoadDstReg)         // vdata
3478     .addUse(RSrc)               // rsrc
3479     .addUse(VIndex)             // vindex
3480     .addUse(VOffset)            // voffset
3481     .addUse(SOffset)            // soffset
3482     .addImm(ImmOffset);         // offset(imm)
3483 
3484   if (IsTyped)
3485     MIB.addImm(Format);
3486 
3487   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3488      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3489      .addMemOperand(MMO);
3490 
3491   if (LoadDstReg != Dst) {
3492     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3493 
3494     // The extending load result was widened; truncate back to the original type.
3495     if (IsExtLoad)
3496       B.buildTrunc(Dst, LoadDstReg);
3497     else {
3498       // Repack to original 16-bit vector result
3499       // FIXME: G_TRUNC should work, but legalization currently fails
3500       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3501       SmallVector<Register, 4> Repack;
3502       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3503         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3504       B.buildMerge(Dst, Repack);
3505     }
3506   }
3507 
3508   MI.eraseFromParent();
3509   return true;
3510 }
3511 
3512 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3513                                                MachineIRBuilder &B,
3514                                                bool IsInc) const {
3515   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3516                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3517   B.buildInstr(Opc)
3518     .addDef(MI.getOperand(0).getReg())
3519     .addUse(MI.getOperand(2).getReg())
3520     .addUse(MI.getOperand(3).getReg())
3521     .cloneMemRefs(MI);
3522   MI.eraseFromParent();
3523   return true;
3524 }
3525 
3526 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3527   switch (IntrID) {
3528   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3529   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3530     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3531   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3532   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3533     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3534   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3535   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3536     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3537   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3538   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3540   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3541   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3542     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3543   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3544   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3546   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3547   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3548     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3549   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3550   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3551     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3552   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3553   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3555   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3556   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3557     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3558   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3559   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3560     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3561   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3562   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3563     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3564   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3565   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3566     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3567   default:
3568     llvm_unreachable("unhandled atomic opcode");
3569   }
3570 }
3571 
3572 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3573                                                MachineIRBuilder &B,
3574                                                Intrinsic::ID IID) const {
3575   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3576                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3577 
3578   Register Dst = MI.getOperand(0).getReg();
3579   Register VData = MI.getOperand(2).getReg();
3580 
3581   Register CmpVal;
3582   int OpOffset = 0;
3583 
3584   if (IsCmpSwap) {
3585     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3586     ++OpOffset;
3587   }
3588 
3589   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3590   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3591 
3592   // The struct intrinsic variants add one additional operand over raw.
3593   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3594   Register VIndex;
3595   if (HasVIndex) {
3596     VIndex = MI.getOperand(4 + OpOffset).getReg();
3597     ++OpOffset;
3598   }
3599 
3600   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3601   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3602   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3603 
3604   MachineMemOperand *MMO = *MI.memoperands_begin();
3605 
3606   unsigned ImmOffset;
3607   unsigned TotalOffset;
3608   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3609   if (TotalOffset != 0)
3610     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3611 
3612   if (!VIndex)
3613     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3614 
3615   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3616     .addDef(Dst)
3617     .addUse(VData); // vdata
3618 
3619   if (IsCmpSwap)
3620     MIB.addReg(CmpVal);
3621 
3622   MIB.addUse(RSrc)               // rsrc
3623      .addUse(VIndex)             // vindex
3624      .addUse(VOffset)            // voffset
3625      .addUse(SOffset)            // soffset
3626      .addImm(ImmOffset)          // offset(imm)
3627      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3628      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3629      .addMemOperand(MMO);
3630 
3631   MI.eraseFromParent();
3632   return true;
3633 }
3634 
3635 /// Pack the s16 typed address operands of \p MI into dword sized <2 x s16>
3636 /// vectors in \p PackedAddrs.
3637 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3638                                         SmallVectorImpl<Register> &PackedAddrs,
3639                                         int AddrIdx, int DimIdx, int EndIdx,
3640                                         int NumGradients) {
3641   const LLT S16 = LLT::scalar(16);
3642   const LLT V2S16 = LLT::vector(2, 16);
3643 
3644   for (int I = AddrIdx; I < EndIdx; ++I) {
3645     MachineOperand &SrcOp = MI.getOperand(I);
3646     if (!SrcOp.isReg())
3647       continue; // _L to _LZ may have eliminated this.
3648 
3649     Register AddrReg = SrcOp.getReg();
3650 
3651     if (I < DimIdx) {
3652       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3653       PackedAddrs.push_back(AddrReg);
3654     } else {
3655       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3656       // derivatives dx/dh and dx/dv are packed with undef.
3657       if (((I + 1) >= EndIdx) ||
3658           ((NumGradients / 2) % 2 == 1 &&
3659            (I == DimIdx + (NumGradients / 2) - 1 ||
3660             I == DimIdx + NumGradients - 1)) ||
3661           // Check for _L to _LZ optimization
3662           !MI.getOperand(I + 1).isReg()) {
3663         PackedAddrs.push_back(
3664             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3665                 .getReg(0));
3666       } else {
3667         PackedAddrs.push_back(
3668             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3669                 .getReg(0));
3670         ++I;
3671       }
3672     }
3673   }
3674 }
3675 
3676 /// Convert from separate vaddr components to a single vector address register,
3677 /// and replace the remaining operands with $noreg.
3678 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3679                                      int DimIdx, int NumVAddrs) {
3680   const LLT S32 = LLT::scalar(32);
3681 
3682   SmallVector<Register, 8> AddrRegs;
3683   for (int I = 0; I != NumVAddrs; ++I) {
3684     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3685     if (SrcOp.isReg()) {
3686       AddrRegs.push_back(SrcOp.getReg());
3687       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3688     }
3689   }
3690 
3691   int NumAddrRegs = AddrRegs.size();
3692   if (NumAddrRegs != 1) {
3693     // Round up to 8 elements for v5-v7
3694     // FIXME: Missing intermediate sized register classes and instructions.
3695     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3696       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3697       auto Undef = B.buildUndef(S32);
3698       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3699       NumAddrRegs = RoundedNumRegs;
3700     }
3701 
3702     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3703     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3704   }
3705 
3706   for (int I = 1; I != NumVAddrs; ++I) {
3707     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3708     if (SrcOp.isReg())
3709       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3710   }
3711 }
3712 
3713 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3714 ///
3715 /// Depending on the subtarget, load/store with 16-bit element data need to be
3716 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3717 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3718 /// registers.
3719 ///
3720 /// We don't want to directly select image instructions just yet, but also want
3721 /// to expose all register repacking to the legalizer/combiners. We also don't
3722 /// want a selected instruction entering RegBankSelect. In order to avoid
3723 /// defining a multitude of intermediate image instructions, directly hack on
3724 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3725 /// the now-unnecessary arguments with $noreg.
3726 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3727     MachineInstr &MI, MachineIRBuilder &B,
3728     GISelChangeObserver &Observer,
3729     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3730 
3731   const int NumDefs = MI.getNumExplicitDefs();
3732   bool IsTFE = NumDefs == 2;
3733   // We are only processing the operands of d16 image operations on subtargets
3734   // that use the unpacked register layout, or need to repack the TFE result.
3735 
3736   // TODO: Do we need to guard against already legalized intrinsics?
3737   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3738     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3739 
3740   MachineRegisterInfo *MRI = B.getMRI();
3741   const LLT S32 = LLT::scalar(32);
3742   const LLT S16 = LLT::scalar(16);
3743   const LLT V2S16 = LLT::vector(2, 16);
3744 
3745   // Index of first address argument
3746   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3747 
3748   int NumVAddrs, NumGradients;
3749   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3750   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3751     getDMaskIdx(BaseOpcode, NumDefs);
3752   unsigned DMask = 0;
3753 
3754   // Check for 16-bit addresses and gradients; if present they are packed below.
3755   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3756   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3757   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3758   const bool IsG16 = GradTy == S16;
3759   const bool IsA16 = AddrTy == S16;
3760 
3761   int DMaskLanes = 0;
3762   if (!BaseOpcode->Atomic) {
3763     DMask = MI.getOperand(DMaskIdx).getImm();
3764     if (BaseOpcode->Gather4) {
3765       DMaskLanes = 4;
3766     } else if (DMask != 0) {
3767       DMaskLanes = countPopulation(DMask);
3768     } else if (!IsTFE && !BaseOpcode->Store) {
3769       // If dmask is 0, this is a no-op load. This can be eliminated.
3770       B.buildUndef(MI.getOperand(0));
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774   }
3775 
3776   Observer.changingInstr(MI);
3777   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3778 
3779   unsigned NewOpcode = NumDefs == 0 ?
3780     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3781 
3782   // Track that we legalized this
3783   MI.setDesc(B.getTII().get(NewOpcode));
3784 
3785   // With TFE on, an error flag is still expected even when the dmask is 0.
3786   // Force the dmask to be at least 1, otherwise the instruction will fail.
3787   if (IsTFE && DMask == 0) {
3788     DMask = 0x1;
3789     DMaskLanes = 1;
3790     MI.getOperand(DMaskIdx).setImm(DMask);
3791   }
3792 
3793   if (BaseOpcode->Atomic) {
3794     Register VData0 = MI.getOperand(2).getReg();
3795     LLT Ty = MRI->getType(VData0);
3796 
3797     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3798     if (Ty.isVector())
3799       return false;
3800 
3801     if (BaseOpcode->AtomicX2) {
3802       Register VData1 = MI.getOperand(3).getReg();
3803       // The two values are packed in one register.
3804       LLT PackedTy = LLT::vector(2, Ty);
3805       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3806       MI.getOperand(2).setReg(Concat.getReg(0));
3807       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3808     }
3809   }
3810 
3811   int CorrectedNumVAddrs = NumVAddrs;
3812 
3813   // Optimize _L to _LZ when _L is zero
3814   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3815         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3816     const ConstantFP *ConstantLod;
3817     const int LodIdx = AddrIdx + NumVAddrs - 1;
3818 
3819     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3820       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3821         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3822         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3823           LZMappingInfo->LZ, ImageDimIntr->Dim);
3824 
3825         // The starting indexes should remain in the same place.
3826         --NumVAddrs;
3827         --CorrectedNumVAddrs;
3828 
3829         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3830           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3831         MI.RemoveOperand(LodIdx);
3832       }
3833     }
3834   }
3835 
3836   // Optimize _mip away, when 'lod' is zero
3837   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3838     int64_t ConstantLod;
3839     const int LodIdx = AddrIdx + NumVAddrs - 1;
3840 
3841     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3842       if (ConstantLod == 0) {
3843       // TODO: Change the intrinsic opcode and remove the operand instead of
3844       // replacing it with 0, as is done for the _L to _LZ handling above.
3845         MI.getOperand(LodIdx).ChangeToImmediate(0);
3846         --CorrectedNumVAddrs;
3847       }
3848     }
3849   }
3850 
3851   // Rewrite the addressing register layout before doing anything else.
3852   if (IsA16 || IsG16) {
3853     if (IsA16) {
3854       // Target must support the feature and gradients need to be 16 bit too
3855       if (!ST.hasA16() || !IsG16)
3856         return false;
3857     } else if (!ST.hasG16())
3858       return false;
3859 
3860     if (NumVAddrs > 1) {
3861       SmallVector<Register, 4> PackedRegs;
3862       // Don't compress addresses for G16
3863       const int PackEndIdx =
3864           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3865       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3866                                   PackEndIdx, NumGradients);
3867 
3868       if (!IsA16) {
3869         // Add uncompressed address
3870         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3871           Register AddrReg = MI.getOperand(I).getReg();
3872           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3873           PackedRegs.push_back(AddrReg);
3874         }
3875       }
3876 
3877       // See also below in the non-a16 branch
3878       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3879 
3880       if (!UseNSA && PackedRegs.size() > 1) {
3881         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3882         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3883         PackedRegs[0] = Concat.getReg(0);
3884         PackedRegs.resize(1);
3885       }
3886 
3887       const int NumPacked = PackedRegs.size();
3888       for (int I = 0; I != NumVAddrs; ++I) {
3889         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3890         if (!SrcOp.isReg()) {
3891           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3892           continue;
3893         }
3894 
3895         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3896 
3897         if (I < NumPacked)
3898           SrcOp.setReg(PackedRegs[I]);
3899         else
3900           SrcOp.setReg(AMDGPU::NoRegister);
3901       }
3902     }
3903   } else {
3904     // If the register allocator cannot place the address registers contiguously
3905     // without introducing moves, then using the non-sequential address encoding
3906     // is always preferable, since it saves VALU instructions and is usually a
3907     // wash in terms of code size or even better.
3908     //
3909     // However, we currently have no way of hinting to the register allocator
3910     // that MIMG addresses should be placed contiguously when it is possible to
3911     // do so, so force non-NSA for the common 2-address case as a heuristic.
3912     //
3913     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3914     // allocation when possible.
3915     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3916 
3917     if (!UseNSA && NumVAddrs > 1)
3918       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3919   }
3920 
3921   int Flags = 0;
3922   if (IsA16)
3923     Flags |= 1;
3924   if (IsG16)
3925     Flags |= 2;
3926   MI.addOperand(MachineOperand::CreateImm(Flags));
3927 
3928   if (BaseOpcode->Store) { // No TFE for stores?
3929     // TODO: Handle dmask trim
3930     Register VData = MI.getOperand(1).getReg();
3931     LLT Ty = MRI->getType(VData);
3932     if (!Ty.isVector() || Ty.getElementType() != S16)
3933       return true;
3934 
3935     Register RepackedReg = handleD16VData(B, *MRI, VData);
3936     if (RepackedReg != VData) {
3937       MI.getOperand(1).setReg(RepackedReg);
3938     }
3939 
3940     return true;
3941   }
3942 
3943   Register DstReg = MI.getOperand(0).getReg();
3944   LLT Ty = MRI->getType(DstReg);
3945   const LLT EltTy = Ty.getScalarType();
3946   const bool IsD16 = Ty.getScalarType() == S16;
3947   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3948 
3949   // Confirm that the return type is large enough for the dmask specified
3950   if (NumElts < DMaskLanes)
3951     return false;
3952 
3953   if (NumElts > 4 || DMaskLanes > 4)
3954     return false;
3955 
3956   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3957   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3958 
3959   // The raw dword aligned data component of the load. The only legal cases
3960   // where this matters should be when using the packed D16 format, for
3961   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3962   LLT RoundedTy;
3963 
3964   // S32 vector to cover all data, plus the TFE result element.
3965   LLT TFETy;
3966 
3967   // Register type to use for each loaded component. Will be S32 or V2S16.
3968   LLT RegTy;
3969 
3970   if (IsD16 && ST.hasUnpackedD16VMem()) {
3971     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3972     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3973     RegTy = S32;
3974   } else {
3975     unsigned EltSize = EltTy.getSizeInBits();
3976     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3977     unsigned RoundedSize = 32 * RoundedElts;
3978     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3979     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3980     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3981   }
3982 
3983   // The return type does not need adjustment.
3984   // TODO: Should we change s16 case to s32 or <2 x s16>?
3985   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3986     return true;
3987 
3988   Register Dst1Reg;
3989 
3990   // Insert after the instruction.
3991   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3992 
3993   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3994   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3995   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3996   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3997 
3998   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3999 
4000   MI.getOperand(0).setReg(NewResultReg);
4001 
4002   // In the IR, TFE is supposed to be used with a 2 element struct return
4003   // type. The instruction really returns these two values in one contiguous
4004   // register, with one additional dword beyond the loaded data. Rewrite the
4005   // return type to use a single register result.
4006 
4007   if (IsTFE) {
4008     Dst1Reg = MI.getOperand(1).getReg();
4009     if (MRI->getType(Dst1Reg) != S32)
4010       return false;
4011 
4012     // TODO: Make sure the TFE operand bit is set.
4013     MI.RemoveOperand(1);
4014 
4015     // Handle the easy case that requires no repack instructions.
4016     if (Ty == S32) {
4017       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4018       return true;
4019     }
4020   }
4021 
4022   // Now figure out how to copy the new result register back into the old
4023   // result.
4024   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4025 
4026   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4027 
4028   if (ResultNumRegs == 1) {
4029     assert(!IsTFE);
4030     ResultRegs[0] = NewResultReg;
4031   } else {
4032     // We have to repack into a new vector of some kind.
4033     for (int I = 0; I != NumDataRegs; ++I)
4034       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4035     B.buildUnmerge(ResultRegs, NewResultReg);
4036 
4037     // Drop the final TFE element to get the data part. The TFE result is
4038     // directly written to the right place already.
4039     if (IsTFE)
4040       ResultRegs.resize(NumDataRegs);
4041   }
4042 
4043   // For an s16 scalar result, we form an s32 result with a truncate regardless
4044   // of packed vs. unpacked.
4045   if (IsD16 && !Ty.isVector()) {
4046     B.buildTrunc(DstReg, ResultRegs[0]);
4047     return true;
4048   }
4049 
4050   // Avoid a build/concat_vector of 1 entry.
4051   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4052     B.buildBitcast(DstReg, ResultRegs[0]);
4053     return true;
4054   }
4055 
4056   assert(Ty.isVector());
4057 
4058   if (IsD16) {
4059     // For packed D16 results with TFE enabled, all the data components are
4060     // S32. Cast back to the expected type.
4061     //
4062     // TODO: We don't really need to load s32 elements. We would only need one
4063     // cast for the TFE result if a multiple of v2s16 was used.
4064     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4065       for (Register &Reg : ResultRegs)
4066         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4067     } else if (ST.hasUnpackedD16VMem()) {
4068       for (Register &Reg : ResultRegs)
4069         Reg = B.buildTrunc(S16, Reg).getReg(0);
4070     }
4071   }
4072 
4073   auto padWithUndef = [&](LLT Ty, int NumElts) {
4074     if (NumElts == 0)
4075       return;
4076     Register Undef = B.buildUndef(Ty).getReg(0);
4077     for (int I = 0; I != NumElts; ++I)
4078       ResultRegs.push_back(Undef);
4079   };
4080 
4081   // Pad out any elements eliminated due to the dmask.
4082   LLT ResTy = MRI->getType(ResultRegs[0]);
4083   if (!ResTy.isVector()) {
4084     padWithUndef(ResTy, NumElts - ResultRegs.size());
4085     B.buildBuildVector(DstReg, ResultRegs);
4086     return true;
4087   }
4088 
4089   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4090   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4091 
4092   // Deal with the one annoying legal case.
4093   const LLT V3S16 = LLT::vector(3, 16);
4094   if (Ty == V3S16) {
4095     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4096     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4097     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4098     return true;
4099   }
4100 
4101   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4102   B.buildConcatVectors(DstReg, ResultRegs);
4103   return true;
4104 }
4105 
4106 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4107   MachineInstr &MI, MachineIRBuilder &B,
4108   GISelChangeObserver &Observer) const {
4109   Register Dst = MI.getOperand(0).getReg();
4110   LLT Ty = B.getMRI()->getType(Dst);
4111   unsigned Size = Ty.getSizeInBits();
4112   MachineFunction &MF = B.getMF();
4113 
4114   Observer.changingInstr(MI);
4115 
4116   // FIXME: We don't really need this intermediate instruction. The intrinsic
4117   // should be fixed to have a memory operand. Since it's readnone, we're not
4118   // allowed to add one.
4119   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4120   MI.RemoveOperand(1); // Remove intrinsic ID
4121 
4122   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4123   // TODO: Should this use datalayout alignment?
4124   const unsigned MemSize = (Size + 7) / 8;
4125   const Align MemAlign(4);
4126   MachineMemOperand *MMO = MF.getMachineMemOperand(
4127       MachinePointerInfo(),
4128       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4129           MachineMemOperand::MOInvariant,
4130       MemSize, MemAlign);
4131   MI.addMemOperand(MF, MMO);
4132 
4133   // There are no 96-bit result scalar loads, but widening to 128-bit should
4134   // always be legal. We may need to restore this to a 96-bit result if it turns
4135   // out this needs to be converted to a vector load during RegBankSelect.
4136   if (!isPowerOf2_32(Size)) {
4137     LegalizerHelper Helper(MF, *this, Observer, B);
4138 
4139     if (Ty.isVector())
4140       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4141     else
4142       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4143   }
4144 
4145   Observer.changedInstr(MI);
4146   return true;
4147 }
4148 
4149 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4150                                                 MachineRegisterInfo &MRI,
4151                                                 MachineIRBuilder &B) const {
4152   // On a non-HSA path, or when the trap handler is disabled, insert s_endpgm.
4153   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4154       !ST.isTrapHandlerEnabled()) {
4155     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4156   } else {
4157     // Pass queue pointer to trap handler as input, and insert trap instruction
4158     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4159     MachineRegisterInfo &MRI = *B.getMRI();
4160     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4161     Register LiveIn = getLiveInRegister(
4162         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4163         /*InsertLiveInCopy=*/false);
4164     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4165       return false;
4166     B.buildCopy(SGPR01, LiveIn);
4167     B.buildInstr(AMDGPU::S_TRAP)
4168         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4169         .addReg(SGPR01, RegState::Implicit);
4170   }
4171 
4172   MI.eraseFromParent();
4173   return true;
4174 }
4175 
4176 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4177     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4178   // On a non-HSA path, or when the trap handler is disabled, report a warning
4179   // instead.
4180   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4181       !ST.isTrapHandlerEnabled()) {
4182     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4183                                      "debugtrap handler not supported",
4184                                      MI.getDebugLoc(), DS_Warning);
4185     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4186     Ctx.diagnose(NoTrap);
4187   } else {
4188     // Insert debug-trap instruction
4189     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4190   }
4191 
4192   MI.eraseFromParent();
4193   return true;
4194 }
4195 
4196 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4197                                             MachineInstr &MI) const {
4198   MachineIRBuilder &B = Helper.MIRBuilder;
4199   MachineRegisterInfo &MRI = *B.getMRI();
4200 
4201   // Replace uses of G_BRCOND with the exec-manipulating branch pseudos.
4202   auto IntrID = MI.getIntrinsicID();
4203   switch (IntrID) {
4204   case Intrinsic::amdgcn_if:
4205   case Intrinsic::amdgcn_else: {
4206     MachineInstr *Br = nullptr;
4207     MachineBasicBlock *UncondBrTarget = nullptr;
4208     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4209       const SIRegisterInfo *TRI
4210         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4211 
4212       Register Def = MI.getOperand(1).getReg();
4213       Register Use = MI.getOperand(3).getReg();
4214 
4215       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4216       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4217       if (IntrID == Intrinsic::amdgcn_if) {
4218         B.buildInstr(AMDGPU::SI_IF)
4219           .addDef(Def)
4220           .addUse(Use)
4221           .addMBB(UncondBrTarget);
4222       } else {
4223         B.buildInstr(AMDGPU::SI_ELSE)
4224           .addDef(Def)
4225           .addUse(Use)
4226           .addMBB(UncondBrTarget)
4227           .addImm(0);
4228       }
4229 
4230       if (Br) {
4231         Br->getOperand(0).setMBB(CondBrTarget);
4232       } else {
4233         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4234         // since we're swapping branch targets it needs to be reinserted.
4235         // FIXME: IRTranslator should probably not do this
4236         B.buildBr(*CondBrTarget);
4237       }
4238 
4239       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4240       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4241       MI.eraseFromParent();
4242       BrCond->eraseFromParent();
4243       return true;
4244     }
4245 
4246     return false;
4247   }
4248   case Intrinsic::amdgcn_loop: {
4249     MachineInstr *Br = nullptr;
4250     MachineBasicBlock *UncondBrTarget = nullptr;
4251     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4252       const SIRegisterInfo *TRI
4253         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4254 
4255       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4256       Register Reg = MI.getOperand(2).getReg();
4257 
4258       B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
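    // The wavefront size is a subtarget constant, so fold the intrinsic to an
    // immediate.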
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
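  // All of the buffer atomic variants share one legalization path, keyed on
  // the intrinsic ID.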
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
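    // Image dimension intrinsics share a common legalization path; any other
    // intrinsic needs no custom lowering here.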
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
