1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299 // workaround this. Eventually it should ignore the type for loads and only care
300 // about the size. Return true in cases where we will workaround this for now by
301 // bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .legalIf(isPointer(0))
419     .clampScalar(0, S32, S256)
420     .widenScalarToNextPow2(0, 32)
421     .clampMaxNumElements(0, S32, 16)
422     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
423     .scalarize(0);
424 
425   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
426     // Full set of gfx9 features.
427     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
428       .legalFor({S32, S16, V2S16})
429       .clampScalar(0, S16, S32)
430       .clampMaxNumElements(0, S16, 2)
431       .scalarize(0)
432       .widenScalarToNextPow2(0, 32);
433 
434     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
435       .legalFor({S32, S16, V2S16}) // Clamp modifier
436       .minScalar(0, S16)
437       .clampMaxNumElements(0, S16, 2)
438       .scalarize(0)
439       .widenScalarToNextPow2(0, 32)
440       .lower();
441   } else if (ST.has16BitInsts()) {
442     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
443       .legalFor({S32, S16})
444       .clampScalar(0, S16, S32)
445       .scalarize(0)
446       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
447 
448     // Technically the saturating operations require clamp bit support, but this
449     // was introduced at the same time as 16-bit operations.
450     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
451       .legalFor({S32, S16}) // Clamp modifier
452       .minScalar(0, S16)
453       .scalarize(0)
454       .widenScalarToNextPow2(0, 16)
455       .lower();
456 
457     // We're just lowering this, but it helps get a better result to try to
458     // coerce to the desired type first.
459     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
460       .minScalar(0, S16)
461       .scalarize(0)
462       .lower();
463   } else {
464     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
465       .legalFor({S32})
466       .clampScalar(0, S32, S32)
467       .scalarize(0);
468 
469     if (ST.hasIntClamp()) {
470       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
471         .legalFor({S32}) // Clamp modifier.
472         .scalarize(0)
473         .minScalarOrElt(0, S32)
474         .lower();
475     } else {
476       // Clamp bit support was added in VI, along with 16-bit operations.
477       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
478         .minScalar(0, S32)
479         .scalarize(0)
480         .lower();
481     }
482 
483     // FIXME: DAG expansion gets better results. The widening uses the smaller
484     // range values and goes for the min/max lowering directly.
485     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
486       .minScalar(0, S32)
487       .scalarize(0)
488       .lower();
489   }
490 
491   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
492     .customFor({S32, S64})
493     .clampScalar(0, S32, S64)
494     .widenScalarToNextPow2(0, 32)
495     .scalarize(0);
496 
497   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
498     .legalFor({S32})
499     .clampScalar(0, S32, S32)
500     .scalarize(0);
501 
502   // Report legal for any types we can handle anywhere. For the cases only legal
503   // on the SALU, RegBankSelect will be able to re-legalize.
504   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
505     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
506     .clampScalar(0, S32, S64)
507     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
509     .widenScalarToNextPow2(0)
510     .scalarize(0);
511 
512   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
513                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
514     .legalFor({{S32, S1}, {S32, S32}})
515     .minScalar(0, S32)
516     // TODO: .scalarize(0)
517     .lower();
518 
519   getActionDefinitionsBuilder(G_BITCAST)
520     // Don't worry about the size constraint.
521     .legalIf(all(isRegisterType(0), isRegisterType(1)))
522     .lower();
523 
524 
525   getActionDefinitionsBuilder(G_CONSTANT)
526     .legalFor({S1, S32, S64, S16, GlobalPtr,
527                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
528     .legalIf(isPointer(0))
529     .clampScalar(0, S32, S64)
530     .widenScalarToNextPow2(0);
531 
532   getActionDefinitionsBuilder(G_FCONSTANT)
533     .legalFor({S32, S64, S16})
534     .clampScalar(0, S16, S64);
535 
536   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
537       .legalIf(isRegisterType(0))
538       // s1 and s16 are special cases because they have legal operations on
539       // them, but don't really occupy registers in the normal way.
540       .legalFor({S1, S16})
541       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542       .clampScalarOrElt(0, S32, MaxScalar)
543       .widenScalarToNextPow2(0, 32)
544       .clampMaxNumElements(0, S32, 16);
545 
546   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
547 
548   // If the amount is divergent, we have to do a wave reduction to get the
549   // maximum value, so this is expanded during RegBankSelect.
550   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
551     .legalFor({{PrivatePtr, S32}});
552 
553   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
554     .customIf(typeIsNot(0, PrivatePtr));
555 
556   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
557 
558   auto &FPOpActions = getActionDefinitionsBuilder(
559     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
560     .legalFor({S32, S64});
561   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
562     .customFor({S32, S64});
563   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
564     .customFor({S32, S64});
565 
566   if (ST.has16BitInsts()) {
567     if (ST.hasVOP3PInsts())
568       FPOpActions.legalFor({S16, V2S16});
569     else
570       FPOpActions.legalFor({S16});
571 
572     TrigActions.customFor({S16});
573     FDIVActions.customFor({S16});
574   }
575 
576   auto &MinNumMaxNum = getActionDefinitionsBuilder({
577       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
578 
579   if (ST.hasVOP3PInsts()) {
580     MinNumMaxNum.customFor(FPTypesPK16)
581       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
582       .clampMaxNumElements(0, S16, 2)
583       .clampScalar(0, S16, S64)
584       .scalarize(0);
585   } else if (ST.has16BitInsts()) {
586     MinNumMaxNum.customFor(FPTypes16)
587       .clampScalar(0, S16, S64)
588       .scalarize(0);
589   } else {
590     MinNumMaxNum.customFor(FPTypesBase)
591       .clampScalar(0, S32, S64)
592       .scalarize(0);
593   }
594 
595   if (ST.hasVOP3PInsts())
596     FPOpActions.clampMaxNumElements(0, S16, 2);
597 
598   FPOpActions
599     .scalarize(0)
600     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
601 
602   TrigActions
603     .scalarize(0)
604     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
605 
606   FDIVActions
607     .scalarize(0)
608     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
609 
610   getActionDefinitionsBuilder({G_FNEG, G_FABS})
611     .legalFor(FPTypesPK16)
612     .clampMaxNumElements(0, S16, 2)
613     .scalarize(0)
614     .clampScalar(0, S16, S64);
615 
616   if (ST.has16BitInsts()) {
617     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
618       .legalFor({S32, S64, S16})
619       .scalarize(0)
620       .clampScalar(0, S16, S64);
621   } else {
622     getActionDefinitionsBuilder(G_FSQRT)
623       .legalFor({S32, S64})
624       .scalarize(0)
625       .clampScalar(0, S32, S64);
626 
627     if (ST.hasFractBug()) {
628       getActionDefinitionsBuilder(G_FFLOOR)
629         .customFor({S64})
630         .legalFor({S32, S64})
631         .scalarize(0)
632         .clampScalar(0, S32, S64);
633     } else {
634       getActionDefinitionsBuilder(G_FFLOOR)
635         .legalFor({S32, S64})
636         .scalarize(0)
637         .clampScalar(0, S32, S64);
638     }
639   }
640 
641   getActionDefinitionsBuilder(G_FPTRUNC)
642     .legalFor({{S32, S64}, {S16, S32}})
643     .scalarize(0)
644     .lower();
645 
646   getActionDefinitionsBuilder(G_FPEXT)
647     .legalFor({{S64, S32}, {S32, S16}})
648     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
649     .scalarize(0);
650 
651   getActionDefinitionsBuilder(G_FSUB)
652       // Use actual fsub instruction
653       .legalFor({S32})
654       // Must use fadd + fneg
655       .lowerFor({S64, S16, V2S16})
656       .scalarize(0)
657       .clampScalar(0, S32, S64);
658 
659   // Whether this is legal depends on the floating point mode for the function.
660   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
661   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
662     FMad.customFor({S32, S16});
663   else if (ST.hasMadMacF32Insts())
664     FMad.customFor({S32});
665   else if (ST.hasMadF16())
666     FMad.customFor({S16});
667   FMad.scalarize(0)
668       .lower();
669 
670   // TODO: Do we need to clamp maximum bitwidth?
671   getActionDefinitionsBuilder(G_TRUNC)
672     .legalIf(isScalar(0))
673     .legalFor({{V2S16, V2S32}})
674     .clampMaxNumElements(0, S16, 2)
675     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
676     // situations (like an invalid implicit use), we don't want to infinite loop
677     // in the legalizer.
678     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
679     .alwaysLegal();
680 
681   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
682     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
683                {S32, S1}, {S64, S1}, {S16, S1}})
684     .scalarize(0)
685     .clampScalar(0, S32, S64)
686     .widenScalarToNextPow2(1, 32);
687 
688   // TODO: Split s1->s64 during regbankselect for VALU.
689   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
690     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
691     .lowerFor({{S32, S64}})
692     .lowerIf(typeIs(1, S1))
693     .customFor({{S64, S64}});
694   if (ST.has16BitInsts())
695     IToFP.legalFor({{S16, S16}});
696   IToFP.clampScalar(1, S32, S64)
697        .minScalar(0, S32)
698        .scalarize(0)
699        .widenScalarToNextPow2(1);
700 
701   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
702     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
703     .customFor({{S64, S64}})
704     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
705   if (ST.has16BitInsts())
706     FPToI.legalFor({{S16, S16}});
707   else
708     FPToI.minScalar(1, S32);
709 
710   FPToI.minScalar(0, S32)
711        .scalarize(0)
712        .lower();
713 
714   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
715     .scalarize(0)
716     .lower();
717 
718   if (ST.has16BitInsts()) {
719     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
720       .legalFor({S16, S32, S64})
721       .clampScalar(0, S16, S64)
722       .scalarize(0);
723   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
724     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
725       .legalFor({S32, S64})
726       .clampScalar(0, S32, S64)
727       .scalarize(0);
728   } else {
729     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
730       .legalFor({S32})
731       .customFor({S64})
732       .clampScalar(0, S32, S64)
733       .scalarize(0);
734   }
735 
736   getActionDefinitionsBuilder(G_PTR_ADD)
737     .legalIf(all(isPointer(0), sameSize(0, 1)))
738     .scalarize(0)
739     .scalarSameSizeAs(1, 0);
740 
741   getActionDefinitionsBuilder(G_PTRMASK)
742     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
743     .scalarSameSizeAs(1, 0)
744     .scalarize(0);
745 
746   auto &CmpBuilder =
747     getActionDefinitionsBuilder(G_ICMP)
748     // The compare output type differs based on the register bank of the output,
749     // so make both s1 and s32 legal.
750     //
751     // Scalar compares producing output in scc will be promoted to s32, as that
752     // is the allocatable register type that will be needed for the copy from
753     // scc. This will be promoted during RegBankSelect, and we assume something
754     // before that won't try to use s32 result types.
755     //
756     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
757     // bank.
758     .legalForCartesianProduct(
759       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
760     .legalForCartesianProduct(
761       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
762   if (ST.has16BitInsts()) {
763     CmpBuilder.legalFor({{S1, S16}});
764   }
765 
766   CmpBuilder
767     .widenScalarToNextPow2(1)
768     .clampScalar(1, S32, S64)
769     .scalarize(0)
770     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
771 
772   getActionDefinitionsBuilder(G_FCMP)
773     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
774     .widenScalarToNextPow2(1)
775     .clampScalar(1, S32, S64)
776     .scalarize(0);
777 
778   // FIXME: fpow has a selection pattern that should move to custom lowering.
779   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
780   if (ST.has16BitInsts())
781     Exp2Ops.legalFor({S32, S16});
782   else
783     Exp2Ops.legalFor({S32});
784   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
785   Exp2Ops.scalarize(0);
786 
787   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
788   if (ST.has16BitInsts())
789     ExpOps.customFor({{S32}, {S16}});
790   else
791     ExpOps.customFor({S32});
792   ExpOps.clampScalar(0, MinScalarFPTy, S32)
793         .scalarize(0);
794 
795   getActionDefinitionsBuilder(G_FPOWI)
796     .clampScalar(0, MinScalarFPTy, S32)
797     .lower();
798 
799   // The 64-bit versions produce 32-bit results, but only on the SALU.
800   getActionDefinitionsBuilder(G_CTPOP)
801     .legalFor({{S32, S32}, {S32, S64}})
802     .clampScalar(0, S32, S32)
803     .clampScalar(1, S32, S64)
804     .scalarize(0)
805     .widenScalarToNextPow2(0, 32)
806     .widenScalarToNextPow2(1, 32);
807 
808   // The hardware instructions return a different result on 0 than the generic
809   // instructions expect. The hardware produces -1, but these produce the
810   // bitwidth.
811   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
812     .scalarize(0)
813     .clampScalar(0, S32, S32)
814     .clampScalar(1, S32, S64)
815     .widenScalarToNextPow2(0, 32)
816     .widenScalarToNextPow2(1, 32)
817     .lower();
818 
819   // The 64-bit versions produce 32-bit results, but only on the SALU.
820   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
821     .legalFor({{S32, S32}, {S32, S64}})
822     .clampScalar(0, S32, S32)
823     .clampScalar(1, S32, S64)
824     .scalarize(0)
825     .widenScalarToNextPow2(0, 32)
826     .widenScalarToNextPow2(1, 32);
827 
828   getActionDefinitionsBuilder(G_BITREVERSE)
829     .legalFor({S32})
830     .clampScalar(0, S32, S32)
831     .scalarize(0);
832 
833   if (ST.has16BitInsts()) {
834     getActionDefinitionsBuilder(G_BSWAP)
835       .legalFor({S16, S32, V2S16})
836       .clampMaxNumElements(0, S16, 2)
837       // FIXME: Fixing non-power-of-2 before clamp is workaround for
838       // narrowScalar limitation.
839       .widenScalarToNextPow2(0)
840       .clampScalar(0, S16, S32)
841       .scalarize(0);
842 
843     if (ST.hasVOP3PInsts()) {
844       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
845         .legalFor({S32, S16, V2S16})
846         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
847         .clampMaxNumElements(0, S16, 2)
848         .minScalar(0, S16)
849         .widenScalarToNextPow2(0)
850         .scalarize(0)
851         .lower();
852     } else {
853       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
854         .legalFor({S32, S16})
855         .widenScalarToNextPow2(0)
856         .minScalar(0, S16)
857         .scalarize(0)
858         .lower();
859     }
860   } else {
861     // TODO: Should have same legality without v_perm_b32
862     getActionDefinitionsBuilder(G_BSWAP)
863       .legalFor({S32})
864       .lowerIf(scalarNarrowerThan(0, 32))
865       // FIXME: Fixing non-power-of-2 before clamp is workaround for
866       // narrowScalar limitation.
867       .widenScalarToNextPow2(0)
868       .maxScalar(0, S32)
869       .scalarize(0)
870       .lower();
871 
872     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
873       .legalFor({S32})
874       .minScalar(0, S32)
875       .widenScalarToNextPow2(0)
876       .scalarize(0)
877       .lower();
878   }
879 
880   getActionDefinitionsBuilder(G_INTTOPTR)
881     // List the common cases
882     .legalForCartesianProduct(AddrSpaces64, {S64})
883     .legalForCartesianProduct(AddrSpaces32, {S32})
884     .scalarize(0)
885     // Accept any address space as long as the size matches
886     .legalIf(sameSize(0, 1))
887     .widenScalarIf(smallerThan(1, 0),
888       [](const LegalityQuery &Query) {
889         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
890       })
891     .narrowScalarIf(largerThan(1, 0),
892       [](const LegalityQuery &Query) {
893         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
894       });
895 
896   getActionDefinitionsBuilder(G_PTRTOINT)
897     // List the common cases
898     .legalForCartesianProduct(AddrSpaces64, {S64})
899     .legalForCartesianProduct(AddrSpaces32, {S32})
900     .scalarize(0)
901     // Accept any address space as long as the size matches
902     .legalIf(sameSize(0, 1))
903     .widenScalarIf(smallerThan(0, 1),
904       [](const LegalityQuery &Query) {
905         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
906       })
907     .narrowScalarIf(
908       largerThan(0, 1),
909       [](const LegalityQuery &Query) {
910         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
911       });
912 
913   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
914     .scalarize(0)
915     .custom();
916 
917   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
918                                     bool IsLoad) -> bool {
919     const LLT DstTy = Query.Types[0];
920 
921     // Split vector extloads.
922     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
923     unsigned Align = Query.MMODescrs[0].AlignInBits;
924 
925     if (MemSize < DstTy.getSizeInBits())
926       MemSize = std::max(MemSize, Align);
927 
928     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
929       return true;
930 
931     const LLT PtrTy = Query.Types[1];
932     unsigned AS = PtrTy.getAddressSpace();
933     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
934       return true;
935 
936     // Catch weird sized loads that don't evenly divide into the access sizes
937     // TODO: May be able to widen depending on alignment etc.
938     unsigned NumRegs = (MemSize + 31) / 32;
939     if (NumRegs == 3) {
940       if (!ST.hasDwordx3LoadStores())
941         return true;
942     } else {
943       // If the alignment allows, these should have been widened.
944       if (!isPowerOf2_32(NumRegs))
945         return true;
946     }
947 
948     if (Align < MemSize) {
949       const SITargetLowering *TLI = ST.getTargetLowering();
950       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
951     }
952 
953     return false;
954   };
955 
956   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
957                                          unsigned Opc) -> bool {
958     unsigned Size = Query.Types[0].getSizeInBits();
959     if (isPowerOf2_32(Size))
960       return false;
961 
962     if (Size == 96 && ST.hasDwordx3LoadStores())
963       return false;
964 
965     unsigned AddrSpace = Query.Types[1].getAddressSpace();
966     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
967       return false;
968 
969     unsigned Align = Query.MMODescrs[0].AlignInBits;
970     unsigned RoundedSize = NextPowerOf2(Size);
971     return (Align >= RoundedSize);
972   };
973 
974   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
975   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
976   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
977 
978   // TODO: Refine based on subtargets which support unaligned access or 128-bit
979   // LDS
980   // TODO: Unsupported flat for SI.
981 
982   for (unsigned Op : {G_LOAD, G_STORE}) {
983     const bool IsStore = Op == G_STORE;
984 
985     auto &Actions = getActionDefinitionsBuilder(Op);
986     // Explicitly list some common cases.
987     // TODO: Does this help compile time at all?
988     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
989                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
990                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
991                                       {S64, GlobalPtr, 64, GlobalAlign32},
992                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
993                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
994                                       {S32, GlobalPtr, 8, GlobalAlign8},
995                                       {S32, GlobalPtr, 16, GlobalAlign16},
996 
997                                       {S32, LocalPtr, 32, 32},
998                                       {S64, LocalPtr, 64, 32},
999                                       {V2S32, LocalPtr, 64, 32},
1000                                       {S32, LocalPtr, 8, 8},
1001                                       {S32, LocalPtr, 16, 16},
1002                                       {V2S16, LocalPtr, 32, 32},
1003 
1004                                       {S32, PrivatePtr, 32, 32},
1005                                       {S32, PrivatePtr, 8, 8},
1006                                       {S32, PrivatePtr, 16, 16},
1007                                       {V2S16, PrivatePtr, 32, 32},
1008 
1009                                       {S32, ConstantPtr, 32, GlobalAlign32},
1010                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1011                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1012                                       {S64, ConstantPtr, 64, GlobalAlign32},
1013                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1014     Actions.legalIf(
1015       [=](const LegalityQuery &Query) -> bool {
1016         return isLoadStoreLegal(ST, Query, Op);
1017       });
1018 
1019     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1020     // 64-bits.
1021     //
1022     // TODO: Should generalize bitcast action into coerce, which will also cover
1023     // inserting addrspacecasts.
1024     Actions.customIf(typeIs(1, Constant32Ptr));
1025 
1026     // Turn any illegal element vectors into something easier to deal
1027     // with. These will ultimately produce 32-bit scalar shifts to extract the
1028     // parts anyway.
1029     //
1030     // For odd 16-bit element vectors, prefer to split those into pieces with
1031     // 16-bit vector parts.
1032     Actions.bitcastIf(
1033       [=](const LegalityQuery &Query) -> bool {
1034         const LLT Ty = Query.Types[0];
1035         const unsigned Size = Ty.getSizeInBits();
1036 
1037         if (Size != Query.MMODescrs[0].SizeInBits)
1038           return Size <= 32 && Ty.isVector();
1039 
1040         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1041           return true;
1042         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1043                !isRegisterVectorElementType(Ty.getElementType());
1044       }, bitcastToRegisterType(0));
1045 
1046     Actions
1047         .customIf(typeIs(1, Constant32Ptr))
1048         // Widen suitably aligned loads by loading extra elements.
1049         .moreElementsIf([=](const LegalityQuery &Query) {
1050             const LLT Ty = Query.Types[0];
1051             return Op == G_LOAD && Ty.isVector() &&
1052                    shouldWidenLoadResult(Query, Op);
1053           }, moreElementsToNextPow2(0))
1054         .widenScalarIf([=](const LegalityQuery &Query) {
1055             const LLT Ty = Query.Types[0];
1056             return Op == G_LOAD && !Ty.isVector() &&
1057                    shouldWidenLoadResult(Query, Op);
1058           }, widenScalarOrEltToNextPow2(0))
1059         .narrowScalarIf(
1060             [=](const LegalityQuery &Query) -> bool {
1061               return !Query.Types[0].isVector() &&
1062                      needToSplitMemOp(Query, Op == G_LOAD);
1063             },
1064             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1065               const LLT DstTy = Query.Types[0];
1066               const LLT PtrTy = Query.Types[1];
1067 
1068               const unsigned DstSize = DstTy.getSizeInBits();
1069               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1070 
1071               // Split extloads.
1072               if (DstSize > MemSize)
1073                 return std::make_pair(0, LLT::scalar(MemSize));
1074 
1075               if (!isPowerOf2_32(DstSize)) {
1076                 // We're probably decomposing an odd sized store. Try to split
1077                 // to the widest type. TODO: Account for alignment. As-is it
1078                 // should be OK, since the new parts will be further legalized.
1079                 unsigned FloorSize = PowerOf2Floor(DstSize);
1080                 return std::make_pair(0, LLT::scalar(FloorSize));
1081               }
1082 
1083               if (DstSize > 32 && (DstSize % 32 != 0)) {
1084                 // FIXME: Need a way to specify non-extload of larger size if
1085                 // suitably aligned.
1086                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1087               }
1088 
1089               unsigned MaxSize = maxSizeForAddrSpace(ST,
1090                                                      PtrTy.getAddressSpace(),
1091                                                      Op == G_LOAD);
1092               if (MemSize > MaxSize)
1093                 return std::make_pair(0, LLT::scalar(MaxSize));
1094 
1095               unsigned Align = Query.MMODescrs[0].AlignInBits;
1096               return std::make_pair(0, LLT::scalar(Align));
1097             })
1098         .fewerElementsIf(
1099             [=](const LegalityQuery &Query) -> bool {
1100               return Query.Types[0].isVector() &&
1101                      needToSplitMemOp(Query, Op == G_LOAD);
1102             },
1103             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1104               const LLT DstTy = Query.Types[0];
1105               const LLT PtrTy = Query.Types[1];
1106 
1107               LLT EltTy = DstTy.getElementType();
1108               unsigned MaxSize = maxSizeForAddrSpace(ST,
1109                                                      PtrTy.getAddressSpace(),
1110                                                      Op == G_LOAD);
1111 
1112               // FIXME: Handle widened to power of 2 results better. This ends
1113               // up scalarizing.
1114               // FIXME: 3 element stores scalarized on SI
1115 
1116               // Split if it's too large for the address space.
1117               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1118                 unsigned NumElts = DstTy.getNumElements();
1119                 unsigned EltSize = EltTy.getSizeInBits();
1120 
1121                 if (MaxSize % EltSize == 0) {
1122                   return std::make_pair(
1123                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1124                 }
1125 
1126                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1127 
1128                 // FIXME: Refine when odd breakdowns handled
1129                 // The scalars will need to be re-legalized.
1130                 if (NumPieces == 1 || NumPieces >= NumElts ||
1131                     NumElts % NumPieces != 0)
1132                   return std::make_pair(0, EltTy);
1133 
1134                 return std::make_pair(0,
1135                                       LLT::vector(NumElts / NumPieces, EltTy));
1136               }
1137 
1138               // FIXME: We could probably handle weird extending loads better.
1139               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1140               if (DstTy.getSizeInBits() > MemSize)
1141                 return std::make_pair(0, EltTy);
1142 
1143               unsigned EltSize = EltTy.getSizeInBits();
1144               unsigned DstSize = DstTy.getSizeInBits();
1145               if (!isPowerOf2_32(DstSize)) {
1146                 // We're probably decomposing an odd sized store. Try to split
1147                 // to the widest type. TODO: Account for alignment. As-is it
1148                 // should be OK, since the new parts will be further legalized.
1149                 unsigned FloorSize = PowerOf2Floor(DstSize);
1150                 return std::make_pair(
1151                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1152               }
1153 
1154               // Need to split because of alignment.
1155               unsigned Align = Query.MMODescrs[0].AlignInBits;
1156               if (EltSize > Align &&
1157                   (EltSize / Align < DstTy.getNumElements())) {
1158                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1159               }
1160 
1161               // May need relegalization for the scalars.
1162               return std::make_pair(0, EltTy);
1163             })
1164         .minScalar(0, S32);
1165 
1166     if (IsStore)
1167       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1168 
1169     // TODO: Need a bitcast lower option?
1170     Actions
1171         .widenScalarToNextPow2(0)
1172         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1173   }
1174 
1175   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1176                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1177                                                   {S32, GlobalPtr, 16, 2 * 8},
1178                                                   {S32, LocalPtr, 8, 8},
1179                                                   {S32, LocalPtr, 16, 16},
1180                                                   {S32, PrivatePtr, 8, 8},
1181                                                   {S32, PrivatePtr, 16, 16},
1182                                                   {S32, ConstantPtr, 8, 8},
1183                                                   {S32, ConstantPtr, 16, 2 * 8}});
1184   if (ST.hasFlatAddressSpace()) {
1185     ExtLoads.legalForTypesWithMemDesc(
1186         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1187   }
1188 
1189   ExtLoads.clampScalar(0, S32, S32)
1190           .widenScalarToNextPow2(0)
1191           .unsupportedIfMemSizeNotPow2()
1192           .lower();
1193 
1194   auto &Atomics = getActionDefinitionsBuilder(
1195     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1196      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1197      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1198      G_ATOMICRMW_UMIN})
1199     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1200                {S64, GlobalPtr}, {S64, LocalPtr},
1201                {S32, RegionPtr}, {S64, RegionPtr}});
1202   if (ST.hasFlatAddressSpace()) {
1203     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1204   }
1205 
1206   if (ST.hasLDSFPAtomics()) {
1207     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1208       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1209   }
1210 
1211   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1212   // demarshalling
1213   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1214     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1215                 {S32, FlatPtr}, {S64, FlatPtr}})
1216     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1217                {S32, RegionPtr}, {S64, RegionPtr}});
1218   // TODO: Pointer types, any 32-bit or 64-bit vector
1219 
1220   // Condition should be s32 for scalar, s1 for vector.
1221   getActionDefinitionsBuilder(G_SELECT)
1222     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1223           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1224           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1225     .clampScalar(0, S16, S64)
1226     .scalarize(1)
1227     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1228     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1229     .clampMaxNumElements(0, S32, 2)
1230     .clampMaxNumElements(0, LocalPtr, 2)
1231     .clampMaxNumElements(0, PrivatePtr, 2)
1232     .scalarize(0)
1233     .widenScalarToNextPow2(0)
1234     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1235 
1236   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1237   // be more flexible with the shift amount type.
1238   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1239     .legalFor({{S32, S32}, {S64, S32}});
1240   if (ST.has16BitInsts()) {
1241     if (ST.hasVOP3PInsts()) {
1242       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1243             .clampMaxNumElements(0, S16, 2);
1244     } else
1245       Shifts.legalFor({{S16, S16}});
1246 
1247     // TODO: Support 16-bit shift amounts for all types
1248     Shifts.widenScalarIf(
1249       [=](const LegalityQuery &Query) {
1250         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1251         // 32-bit amount.
1252         const LLT ValTy = Query.Types[0];
1253         const LLT AmountTy = Query.Types[1];
1254         return ValTy.getSizeInBits() <= 16 &&
1255                AmountTy.getSizeInBits() < 16;
1256       }, changeTo(1, S16));
1257     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1258     Shifts.clampScalar(1, S32, S32);
1259     Shifts.clampScalar(0, S16, S64);
1260     Shifts.widenScalarToNextPow2(0, 16);
1261   } else {
1262     // Make sure we legalize the shift amount type first, as the general
1263     // expansion for the shifted type will produce much worse code if it hasn't
1264     // been truncated already.
1265     Shifts.clampScalar(1, S32, S32);
1266     Shifts.clampScalar(0, S32, S64);
1267     Shifts.widenScalarToNextPow2(0, 32);
1268   }
1269   Shifts.scalarize(0);
1270 
1271   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1272     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1273     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1274     unsigned IdxTypeIdx = 2;
1275 
1276     getActionDefinitionsBuilder(Op)
1277       .customIf([=](const LegalityQuery &Query) {
1278           const LLT EltTy = Query.Types[EltTypeIdx];
1279           const LLT VecTy = Query.Types[VecTypeIdx];
1280           const LLT IdxTy = Query.Types[IdxTypeIdx];
1281           return (EltTy.getSizeInBits() == 16 ||
1282                   EltTy.getSizeInBits() % 32 == 0) &&
1283                  VecTy.getSizeInBits() % 32 == 0 &&
1284                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1285                  IdxTy.getSizeInBits() == 32;
1286         })
1287       .clampScalar(EltTypeIdx, S32, S64)
1288       .clampScalar(VecTypeIdx, S32, S64)
1289       .clampScalar(IdxTypeIdx, S32, S32);
1290   }
1291 
1292   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1293     .unsupportedIf([=](const LegalityQuery &Query) {
1294         const LLT &EltTy = Query.Types[1].getElementType();
1295         return Query.Types[0] != EltTy;
1296       });
1297 
1298   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1299     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1300     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1301 
1302     // FIXME: Doesn't handle extract of illegal sizes.
1303     getActionDefinitionsBuilder(Op)
1304       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1305       // FIXME: Multiples of 16 should not be legal.
1306       .legalIf([=](const LegalityQuery &Query) {
1307           const LLT BigTy = Query.Types[BigTyIdx];
1308           const LLT LitTy = Query.Types[LitTyIdx];
1309           return (BigTy.getSizeInBits() % 32 == 0) &&
1310                  (LitTy.getSizeInBits() % 16 == 0);
1311         })
1312       .widenScalarIf(
1313         [=](const LegalityQuery &Query) {
1314           const LLT BigTy = Query.Types[BigTyIdx];
1315           return (BigTy.getScalarSizeInBits() < 16);
1316         },
1317         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1318       .widenScalarIf(
1319         [=](const LegalityQuery &Query) {
1320           const LLT LitTy = Query.Types[LitTyIdx];
1321           return (LitTy.getScalarSizeInBits() < 16);
1322         },
1323         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1324       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1325       .widenScalarToNextPow2(BigTyIdx, 32);
1326 
1327   }
1328 
1329   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1330     .legalForCartesianProduct(AllS32Vectors, {S32})
1331     .legalForCartesianProduct(AllS64Vectors, {S64})
1332     .clampNumElements(0, V16S32, V32S32)
1333     .clampNumElements(0, V2S64, V16S64)
1334     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1335 
1336   if (ST.hasScalarPackInsts()) {
1337     BuildVector
1338       // FIXME: Should probably widen s1 vectors straight to s32
1339       .minScalarOrElt(0, S16)
1340       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1341       .minScalar(1, S32);
1342 
1343     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1344       .legalFor({V2S16, S32})
1345       .lower();
1346     BuildVector.minScalarOrElt(0, S32);
1347   } else {
1348     BuildVector.customFor({V2S16, S16});
1349     BuildVector.minScalarOrElt(0, S32);
1350 
1351     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1352       .customFor({V2S16, S32})
1353       .lower();
1354   }
1355 
1356   BuildVector.legalIf(isRegisterType(0));
1357 
1358   // FIXME: Clamp maximum size
1359   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1360     .legalIf(isRegisterType(0));
1361 
1362   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1363   // pre-legalize.
1364   if (ST.hasVOP3PInsts()) {
1365     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1366       .customFor({V2S16, V2S16})
1367       .lower();
1368   } else
1369     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1370 
1371   // Merge/Unmerge
1372   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1373     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1374     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1375 
1376     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1377       const LLT Ty = Query.Types[TypeIdx];
1378       if (Ty.isVector()) {
1379         const LLT &EltTy = Ty.getElementType();
1380         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1381           return true;
1382         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1383           return true;
1384       }
1385       return false;
1386     };
1387 
1388     auto &Builder = getActionDefinitionsBuilder(Op)
1389       .lowerFor({{S16, V2S16}})
1390       .lowerIf([=](const LegalityQuery &Query) {
1391           const LLT BigTy = Query.Types[BigTyIdx];
1392           return BigTy.getSizeInBits() == 32;
1393         })
1394       // Try to widen to s16 first for small types.
1395       // TODO: Only do this on targets with legal s16 shifts
1396       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1397       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1398       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1399       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1400                            elementTypeIs(1, S16)),
1401                        changeTo(1, V2S16))
1402       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1403       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1404       // valid.
1405       .clampScalar(LitTyIdx, S32, S512)
1406       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1407       // Break up vectors with weird elements into scalars
1408       .fewerElementsIf(
1409         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1410         scalarize(0))
1411       .fewerElementsIf(
1412         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1413         scalarize(1))
1414       .clampScalar(BigTyIdx, S32, MaxScalar);
1415 
1416     if (Op == G_MERGE_VALUES) {
1417       Builder.widenScalarIf(
1418         // TODO: Use 16-bit shifts if legal for 8-bit values?
1419         [=](const LegalityQuery &Query) {
1420           const LLT Ty = Query.Types[LitTyIdx];
1421           return Ty.getSizeInBits() < 32;
1422         },
1423         changeTo(LitTyIdx, S32));
1424     }
1425 
1426     Builder.widenScalarIf(
1427       [=](const LegalityQuery &Query) {
1428         const LLT Ty = Query.Types[BigTyIdx];
1429         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1430           Ty.getSizeInBits() % 16 != 0;
1431       },
1432       [=](const LegalityQuery &Query) {
1433         // Pick the next power of 2, or a multiple of 64 over 128.
1434         // Whichever is smaller.
1435         const LLT &Ty = Query.Types[BigTyIdx];
1436         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1437         if (NewSizeInBits >= 256) {
1438           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1439           if (RoundedTo < NewSizeInBits)
1440             NewSizeInBits = RoundedTo;
1441         }
1442         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1443       })
1444       .legalIf([=](const LegalityQuery &Query) {
1445           const LLT &BigTy = Query.Types[BigTyIdx];
1446           const LLT &LitTy = Query.Types[LitTyIdx];
1447 
1448           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1449             return false;
1450           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1451             return false;
1452 
1453           return BigTy.getSizeInBits() % 16 == 0 &&
1454                  LitTy.getSizeInBits() % 16 == 0 &&
1455                  BigTy.getSizeInBits() <= MaxRegisterSize;
1456         })
1457       // Any vectors left are the wrong size. Scalarize them.
1458       .scalarize(0)
1459       .scalarize(1);
1460   }
1461 
1462   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1463   // RegBankSelect.
1464   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1465     .legalFor({{S32}, {S64}});
1466 
1467   if (ST.hasVOP3PInsts()) {
1468     SextInReg.lowerFor({{V2S16}})
1469       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1470       // get more vector shift opportunities, since we'll get those when
1471       // expanded.
1472       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1473   } else if (ST.has16BitInsts()) {
1474     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1475   } else {
1476     // Prefer to promote to s32 before lowering if we don't have 16-bit
1477     // shifts. This avoid a lot of intermediate truncate and extend operations.
1478     SextInReg.lowerFor({{S32}, {S64}});
1479   }
1480 
1481   SextInReg
1482     .scalarize(0)
1483     .clampScalar(0, S32, S64)
1484     .lower();
1485 
1486   getActionDefinitionsBuilder(G_FSHR)
1487     .legalFor({{S32, S32}})
1488     .scalarize(0)
1489     .lower();
1490 
1491   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1492     .legalFor({S64});
1493 
1494   getActionDefinitionsBuilder(G_FENCE)
1495     .alwaysLegal();
1496 
1497   getActionDefinitionsBuilder({
1498       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1499       G_FCOPYSIGN,
1500 
1501       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1502       G_ATOMICRMW_NAND,
1503       G_ATOMICRMW_FSUB,
1504       G_READ_REGISTER,
1505       G_WRITE_REGISTER,
1506 
1507       G_SADDO, G_SSUBO,
1508 
1509        // TODO: Implement
1510       G_FMINIMUM, G_FMAXIMUM,
1511       G_FSHL
1512     }).lower();
1513 
1514   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1515         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1516         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1517     .unsupported();
1518 
1519   computeTables();
1520   verify(*ST.getInstrInfo());
1521 }
1522 
1523 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1524                                          MachineInstr &MI) const {
1525   MachineIRBuilder &B = Helper.MIRBuilder;
1526   MachineRegisterInfo &MRI = *B.getMRI();
1527   GISelChangeObserver &Observer = Helper.Observer;
1528 
1529   switch (MI.getOpcode()) {
1530   case TargetOpcode::G_ADDRSPACE_CAST:
1531     return legalizeAddrSpaceCast(MI, MRI, B);
1532   case TargetOpcode::G_FRINT:
1533     return legalizeFrint(MI, MRI, B);
1534   case TargetOpcode::G_FCEIL:
1535     return legalizeFceil(MI, MRI, B);
1536   case TargetOpcode::G_INTRINSIC_TRUNC:
1537     return legalizeIntrinsicTrunc(MI, MRI, B);
1538   case TargetOpcode::G_SITOFP:
1539     return legalizeITOFP(MI, MRI, B, true);
1540   case TargetOpcode::G_UITOFP:
1541     return legalizeITOFP(MI, MRI, B, false);
1542   case TargetOpcode::G_FPTOSI:
1543     return legalizeFPTOI(MI, MRI, B, true);
1544   case TargetOpcode::G_FPTOUI:
1545     return legalizeFPTOI(MI, MRI, B, false);
1546   case TargetOpcode::G_FMINNUM:
1547   case TargetOpcode::G_FMAXNUM:
1548   case TargetOpcode::G_FMINNUM_IEEE:
1549   case TargetOpcode::G_FMAXNUM_IEEE:
1550     return legalizeMinNumMaxNum(Helper, MI);
1551   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1552     return legalizeExtractVectorElt(MI, MRI, B);
1553   case TargetOpcode::G_INSERT_VECTOR_ELT:
1554     return legalizeInsertVectorElt(MI, MRI, B);
1555   case TargetOpcode::G_SHUFFLE_VECTOR:
1556     return legalizeShuffleVector(MI, MRI, B);
1557   case TargetOpcode::G_FSIN:
1558   case TargetOpcode::G_FCOS:
1559     return legalizeSinCos(MI, MRI, B);
1560   case TargetOpcode::G_GLOBAL_VALUE:
1561     return legalizeGlobalValue(MI, MRI, B);
1562   case TargetOpcode::G_LOAD:
1563     return legalizeLoad(MI, MRI, B, Observer);
1564   case TargetOpcode::G_FMAD:
1565     return legalizeFMad(MI, MRI, B);
1566   case TargetOpcode::G_FDIV:
1567     return legalizeFDIV(MI, MRI, B);
1568   case TargetOpcode::G_UDIV:
1569   case TargetOpcode::G_UREM:
1570     return legalizeUDIV_UREM(MI, MRI, B);
1571   case TargetOpcode::G_SDIV:
1572   case TargetOpcode::G_SREM:
1573     return legalizeSDIV_SREM(MI, MRI, B);
1574   case TargetOpcode::G_ATOMIC_CMPXCHG:
1575     return legalizeAtomicCmpXChg(MI, MRI, B);
1576   case TargetOpcode::G_FLOG:
1577     return legalizeFlog(MI, B, numbers::ln2f);
1578   case TargetOpcode::G_FLOG10:
1579     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1580   case TargetOpcode::G_FEXP:
1581     return legalizeFExp(MI, B);
1582   case TargetOpcode::G_FPOW:
1583     return legalizeFPow(MI, B);
1584   case TargetOpcode::G_FFLOOR:
1585     return legalizeFFloor(MI, MRI, B);
1586   case TargetOpcode::G_BUILD_VECTOR:
1587     return legalizeBuildVector(MI, MRI, B);
1588   default:
1589     return false;
1590   }
1591 
1592   llvm_unreachable("expected switch to return");
1593 }
1594 
1595 Register AMDGPULegalizerInfo::getSegmentAperture(
1596   unsigned AS,
1597   MachineRegisterInfo &MRI,
1598   MachineIRBuilder &B) const {
1599   MachineFunction &MF = B.getMF();
1600   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1601   const LLT S32 = LLT::scalar(32);
1602 
1603   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1604 
1605   if (ST.hasApertureRegs()) {
1606     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1607     // getreg.
1608     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1609         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1610         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1611     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1612         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1613         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1614     unsigned Encoding =
1615         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1616         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1617         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1618 
1619     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1620 
1621     B.buildInstr(AMDGPU::S_GETREG_B32)
1622       .addDef(GetReg)
1623       .addImm(Encoding);
1624     MRI.setType(GetReg, S32);
1625 
1626     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1627     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1628   }
1629 
1630   Register QueuePtr = MRI.createGenericVirtualRegister(
1631     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1632 
1633   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1634     return Register();
1635 
1636   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1637   // private_segment_aperture_base_hi.
1638   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1639 
1640   // TODO: can we be smarter about machine pointer info?
1641   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1642   MachineMemOperand *MMO = MF.getMachineMemOperand(
1643       PtrInfo,
1644       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1645           MachineMemOperand::MOInvariant,
1646       4, commonAlignment(Align(64), StructOffset));
1647 
1648   Register LoadAddr;
1649 
1650   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1651   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1652 }
1653 
1654 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1655   MachineInstr &MI, MachineRegisterInfo &MRI,
1656   MachineIRBuilder &B) const {
1657   MachineFunction &MF = B.getMF();
1658 
1659   const LLT S32 = LLT::scalar(32);
1660   Register Dst = MI.getOperand(0).getReg();
1661   Register Src = MI.getOperand(1).getReg();
1662 
1663   LLT DstTy = MRI.getType(Dst);
1664   LLT SrcTy = MRI.getType(Src);
1665   unsigned DestAS = DstTy.getAddressSpace();
1666   unsigned SrcAS = SrcTy.getAddressSpace();
1667 
1668   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1669   // vector element.
1670   assert(!DstTy.isVector());
1671 
1672   const AMDGPUTargetMachine &TM
1673     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1674 
1675   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1676   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1677     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1678     return true;
1679   }
1680 
1681   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1682     // Truncate.
1683     B.buildExtract(Dst, Src, 0);
1684     MI.eraseFromParent();
1685     return true;
1686   }
1687 
1688   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1689     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1690     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1691 
1692     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1693     // another. Merge operands are required to be the same type, but creating an
1694     // extra ptrtoint would be kind of pointless.
1695     auto HighAddr = B.buildConstant(
1696       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1697     B.buildMerge(Dst, {Src, HighAddr});
1698     MI.eraseFromParent();
1699     return true;
1700   }
1701 
1702   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1703     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1704            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1705     unsigned NullVal = TM.getNullPointerValue(DestAS);
1706 
1707     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1708     auto FlatNull = B.buildConstant(SrcTy, 0);
1709 
1710     // Extract low 32-bits of the pointer.
1711     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1712 
1713     auto CmpRes =
1714         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1715     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1716 
1717     MI.eraseFromParent();
1718     return true;
1719   }
1720 
1721   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1722     return false;
1723 
1724   if (!ST.hasFlatAddressSpace())
1725     return false;
1726 
1727   auto SegmentNull =
1728       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1729   auto FlatNull =
1730       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1731 
1732   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1733   if (!ApertureReg.isValid())
1734     return false;
1735 
1736   auto CmpRes =
1737       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1738 
1739   // Coerce the type of the low half of the result so we can use merge_values.
1740   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1741 
1742   // TODO: Should we allow mismatched types but matching sizes in merges to
1743   // avoid the ptrtoint?
1744   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1745   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1746 
1747   MI.eraseFromParent();
1748   return true;
1749 }
1750 
1751 bool AMDGPULegalizerInfo::legalizeFrint(
1752   MachineInstr &MI, MachineRegisterInfo &MRI,
1753   MachineIRBuilder &B) const {
1754   Register Src = MI.getOperand(1).getReg();
1755   LLT Ty = MRI.getType(Src);
1756   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1757 
1758   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1759   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1760 
1761   auto C1 = B.buildFConstant(Ty, C1Val);
1762   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1763 
1764   // TODO: Should this propagate fast-math-flags?
1765   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1766   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1767 
1768   auto C2 = B.buildFConstant(Ty, C2Val);
1769   auto Fabs = B.buildFAbs(Ty, Src);
1770 
1771   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1772   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1773   MI.eraseFromParent();
1774   return true;
1775 }
1776 
1777 bool AMDGPULegalizerInfo::legalizeFceil(
1778   MachineInstr &MI, MachineRegisterInfo &MRI,
1779   MachineIRBuilder &B) const {
1780 
1781   const LLT S1 = LLT::scalar(1);
1782   const LLT S64 = LLT::scalar(64);
1783 
1784   Register Src = MI.getOperand(1).getReg();
1785   assert(MRI.getType(Src) == S64);
1786 
1787   // result = trunc(src)
1788   // if (src > 0.0 && src != result)
1789   //   result += 1.0
1790 
1791   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1792 
1793   const auto Zero = B.buildFConstant(S64, 0.0);
1794   const auto One = B.buildFConstant(S64, 1.0);
1795   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1796   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1797   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1798   auto Add = B.buildSelect(S64, And, One, Zero);
1799 
1800   // TODO: Should this propagate fast-math-flags?
1801   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1802   return true;
1803 }
1804 
1805 static MachineInstrBuilder extractF64Exponent(Register Hi,
1806                                               MachineIRBuilder &B) {
1807   const unsigned FractBits = 52;
1808   const unsigned ExpBits = 11;
1809   LLT S32 = LLT::scalar(32);
1810 
1811   auto Const0 = B.buildConstant(S32, FractBits - 32);
1812   auto Const1 = B.buildConstant(S32, ExpBits);
1813 
1814   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1815     .addUse(Hi)
1816     .addUse(Const0.getReg(0))
1817     .addUse(Const1.getReg(0));
1818 
1819   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1820 }
1821 
1822 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1823   MachineInstr &MI, MachineRegisterInfo &MRI,
1824   MachineIRBuilder &B) const {
1825   const LLT S1 = LLT::scalar(1);
1826   const LLT S32 = LLT::scalar(32);
1827   const LLT S64 = LLT::scalar(64);
1828 
1829   Register Src = MI.getOperand(1).getReg();
1830   assert(MRI.getType(Src) == S64);
1831 
1832   // TODO: Should this use extract since the low half is unused?
1833   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1834   Register Hi = Unmerge.getReg(1);
1835 
1836   // Extract the upper half, since this is where we will find the sign and
1837   // exponent.
1838   auto Exp = extractF64Exponent(Hi, B);
1839 
1840   const unsigned FractBits = 52;
1841 
1842   // Extract the sign bit.
1843   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1844   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1845 
1846   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1847 
1848   const auto Zero32 = B.buildConstant(S32, 0);
1849 
1850   // Extend back to 64-bits.
1851   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1852 
1853   auto Shr = B.buildAShr(S64, FractMask, Exp);
1854   auto Not = B.buildNot(S64, Shr);
1855   auto Tmp0 = B.buildAnd(S64, Src, Not);
1856   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1857 
1858   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1859   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1860 
1861   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1862   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1863   MI.eraseFromParent();
1864   return true;
1865 }
1866 
1867 bool AMDGPULegalizerInfo::legalizeITOFP(
1868   MachineInstr &MI, MachineRegisterInfo &MRI,
1869   MachineIRBuilder &B, bool Signed) const {
1870 
1871   Register Dst = MI.getOperand(0).getReg();
1872   Register Src = MI.getOperand(1).getReg();
1873 
1874   const LLT S64 = LLT::scalar(64);
1875   const LLT S32 = LLT::scalar(32);
1876 
1877   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1878 
1879   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1880 
1881   auto CvtHi = Signed ?
1882     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1883     B.buildUITOFP(S64, Unmerge.getReg(1));
1884 
1885   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1886 
1887   auto ThirtyTwo = B.buildConstant(S32, 32);
1888   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1889     .addUse(CvtHi.getReg(0))
1890     .addUse(ThirtyTwo.getReg(0));
1891 
1892   // TODO: Should this propagate fast-math-flags?
1893   B.buildFAdd(Dst, LdExp, CvtLo);
1894   MI.eraseFromParent();
1895   return true;
1896 }
1897 
1898 // TODO: Copied from DAG implementation. Verify logic and document how this
1899 // actually works.
1900 bool AMDGPULegalizerInfo::legalizeFPTOI(
1901   MachineInstr &MI, MachineRegisterInfo &MRI,
1902   MachineIRBuilder &B, bool Signed) const {
1903 
1904   Register Dst = MI.getOperand(0).getReg();
1905   Register Src = MI.getOperand(1).getReg();
1906 
1907   const LLT S64 = LLT::scalar(64);
1908   const LLT S32 = LLT::scalar(32);
1909 
1910   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1911 
1912   unsigned Flags = MI.getFlags();
1913 
1914   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1915   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1916   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1917 
1918   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1919   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1920   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1921 
1922   auto Hi = Signed ?
1923     B.buildFPTOSI(S32, FloorMul) :
1924     B.buildFPTOUI(S32, FloorMul);
1925   auto Lo = B.buildFPTOUI(S32, Fma);
1926 
1927   B.buildMerge(Dst, { Lo, Hi });
1928   MI.eraseFromParent();
1929 
1930   return true;
1931 }
1932 
1933 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1934                                                MachineInstr &MI) const {
1935   MachineFunction &MF = Helper.MIRBuilder.getMF();
1936   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1937 
1938   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1939                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1940 
1941   // With ieee_mode disabled, the instructions have the correct behavior
1942   // already for G_FMINNUM/G_FMAXNUM
1943   if (!MFI->getMode().IEEE)
1944     return !IsIEEEOp;
1945 
1946   if (IsIEEEOp)
1947     return true;
1948 
1949   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1950 }
1951 
1952 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1953   MachineInstr &MI, MachineRegisterInfo &MRI,
1954   MachineIRBuilder &B) const {
1955   // TODO: Should move some of this into LegalizerHelper.
1956 
1957   // TODO: Promote dynamic indexing of s16 to s32
1958 
1959   // FIXME: Artifact combiner probably should have replaced the truncated
1960   // constant before this, so we shouldn't need
1961   // getConstantVRegValWithLookThrough.
1962   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1963     MI.getOperand(2).getReg(), MRI);
1964   if (!IdxVal) // Dynamic case will be selected to register indexing.
1965     return true;
1966 
1967   Register Dst = MI.getOperand(0).getReg();
1968   Register Vec = MI.getOperand(1).getReg();
1969 
1970   LLT VecTy = MRI.getType(Vec);
1971   LLT EltTy = VecTy.getElementType();
1972   assert(EltTy == MRI.getType(Dst));
1973 
1974   if (IdxVal->Value < VecTy.getNumElements())
1975     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1976   else
1977     B.buildUndef(Dst);
1978 
1979   MI.eraseFromParent();
1980   return true;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B) const {
1986   // TODO: Should move some of this into LegalizerHelper.
1987 
1988   // TODO: Promote dynamic indexing of s16 to s32
1989 
1990   // FIXME: Artifact combiner probably should have replaced the truncated
1991   // constant before this, so we shouldn't need
1992   // getConstantVRegValWithLookThrough.
1993   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1994     MI.getOperand(3).getReg(), MRI);
1995   if (!IdxVal) // Dynamic case will be selected to register indexing.
1996     return true;
1997 
1998   Register Dst = MI.getOperand(0).getReg();
1999   Register Vec = MI.getOperand(1).getReg();
2000   Register Ins = MI.getOperand(2).getReg();
2001 
2002   LLT VecTy = MRI.getType(Vec);
2003   LLT EltTy = VecTy.getElementType();
2004   assert(EltTy == MRI.getType(Ins));
2005 
2006   if (IdxVal->Value < VecTy.getNumElements())
2007     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2008   else
2009     B.buildUndef(Dst);
2010 
2011   MI.eraseFromParent();
2012   return true;
2013 }
2014 
2015 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2016   MachineInstr &MI, MachineRegisterInfo &MRI,
2017   MachineIRBuilder &B) const {
2018   const LLT V2S16 = LLT::vector(2, 16);
2019 
2020   Register Dst = MI.getOperand(0).getReg();
2021   Register Src0 = MI.getOperand(1).getReg();
2022   LLT DstTy = MRI.getType(Dst);
2023   LLT SrcTy = MRI.getType(Src0);
2024 
2025   if (SrcTy == V2S16 && DstTy == V2S16 &&
2026       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2027     return true;
2028 
2029   MachineIRBuilder HelperBuilder(MI);
2030   GISelObserverWrapper DummyObserver;
2031   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2032   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2033 }
2034 
2035 bool AMDGPULegalizerInfo::legalizeSinCos(
2036   MachineInstr &MI, MachineRegisterInfo &MRI,
2037   MachineIRBuilder &B) const {
2038 
2039   Register DstReg = MI.getOperand(0).getReg();
2040   Register SrcReg = MI.getOperand(1).getReg();
2041   LLT Ty = MRI.getType(DstReg);
2042   unsigned Flags = MI.getFlags();
2043 
2044   Register TrigVal;
2045   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2046   if (ST.hasTrigReducedRange()) {
2047     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2048     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2049       .addUse(MulVal.getReg(0))
2050       .setMIFlags(Flags).getReg(0);
2051   } else
2052     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2053 
2054   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2055     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2056   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2057     .addUse(TrigVal)
2058     .setMIFlags(Flags);
2059   MI.eraseFromParent();
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2064                                                   MachineIRBuilder &B,
2065                                                   const GlobalValue *GV,
2066                                                   int64_t Offset,
2067                                                   unsigned GAFlags) const {
2068   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2069   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2070   // to the following code sequence:
2071   //
2072   // For constant address space:
2073   //   s_getpc_b64 s[0:1]
2074   //   s_add_u32 s0, s0, $symbol
2075   //   s_addc_u32 s1, s1, 0
2076   //
2077   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2078   //   a fixup or relocation is emitted to replace $symbol with a literal
2079   //   constant, which is a pc-relative offset from the encoding of the $symbol
2080   //   operand to the global variable.
2081   //
2082   // For global address space:
2083   //   s_getpc_b64 s[0:1]
2084   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2085   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2086   //
2087   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2088   //   fixups or relocations are emitted to replace $symbol@*@lo and
2089   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2090   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2091   //   operand to the global variable.
2092   //
2093   // What we want here is an offset from the value returned by s_getpc
2094   // (which is the address of the s_add_u32 instruction) to the global
2095   // variable, but since the encoding of $symbol starts 4 bytes after the start
2096   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2097   // small. This requires us to add 4 to the global variable offset in order to
2098   // compute the correct address.
2099 
2100   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2101 
2102   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2103     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2104 
2105   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2106     .addDef(PCReg);
2107 
2108   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2109   if (GAFlags == SIInstrInfo::MO_NONE)
2110     MIB.addImm(0);
2111   else
2112     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2113 
2114   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2115 
2116   if (PtrTy.getSizeInBits() == 32)
2117     B.buildExtract(DstReg, PCReg, 0);
2118   return true;
2119  }
2120 
2121 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2122   MachineInstr &MI, MachineRegisterInfo &MRI,
2123   MachineIRBuilder &B) const {
2124   Register DstReg = MI.getOperand(0).getReg();
2125   LLT Ty = MRI.getType(DstReg);
2126   unsigned AS = Ty.getAddressSpace();
2127 
2128   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2129   MachineFunction &MF = B.getMF();
2130   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2131 
2132   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2133     if (!MFI->isEntryFunction()) {
2134       const Function &Fn = MF.getFunction();
2135       DiagnosticInfoUnsupported BadLDSDecl(
2136         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2137         DS_Warning);
2138       Fn.getContext().diagnose(BadLDSDecl);
2139 
2140       // We currently don't have a way to correctly allocate LDS objects that
2141       // aren't directly associated with a kernel. We do force inlining of
2142       // functions that use local objects. However, if these dead functions are
2143       // not eliminated, we don't want a compile time error. Just emit a warning
2144       // and a trap, since there should be no callable path here.
2145       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2146       B.buildUndef(DstReg);
2147       MI.eraseFromParent();
2148       return true;
2149     }
2150 
2151     // TODO: We could emit code to handle the initialization somewhere.
2152     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2153       const SITargetLowering *TLI = ST.getTargetLowering();
2154       if (!TLI->shouldUseLDSConstAddress(GV)) {
2155         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2156         return true; // Leave in place;
2157       }
2158 
2159       B.buildConstant(
2160           DstReg,
2161           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2162       MI.eraseFromParent();
2163       return true;
2164     }
2165 
2166     const Function &Fn = MF.getFunction();
2167     DiagnosticInfoUnsupported BadInit(
2168       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2169     Fn.getContext().diagnose(BadInit);
2170     return true;
2171   }
2172 
2173   const SITargetLowering *TLI = ST.getTargetLowering();
2174 
2175   if (TLI->shouldEmitFixup(GV)) {
2176     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2177     MI.eraseFromParent();
2178     return true;
2179   }
2180 
2181   if (TLI->shouldEmitPCReloc(GV)) {
2182     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2183     MI.eraseFromParent();
2184     return true;
2185   }
2186 
2187   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2188   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2189 
2190   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2191       MachinePointerInfo::getGOT(MF),
2192       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2193           MachineMemOperand::MOInvariant,
2194       8 /*Size*/, Align(8));
2195 
2196   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2197 
2198   if (Ty.getSizeInBits() == 32) {
2199     // Truncate if this is a 32-bit constant adrdess.
2200     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2201     B.buildExtract(DstReg, Load, 0);
2202   } else
2203     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2204 
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeLoad(
2210   MachineInstr &MI, MachineRegisterInfo &MRI,
2211   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2212   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2213   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2214   Observer.changingInstr(MI);
2215   MI.getOperand(1).setReg(Cast.getReg(0));
2216   Observer.changedInstr(MI);
2217   return true;
2218 }
2219 
2220 bool AMDGPULegalizerInfo::legalizeFMad(
2221   MachineInstr &MI, MachineRegisterInfo &MRI,
2222   MachineIRBuilder &B) const {
2223   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2224   assert(Ty.isScalar());
2225 
2226   MachineFunction &MF = B.getMF();
2227   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2228 
2229   // TODO: Always legal with future ftz flag.
2230   // FIXME: Do we need just output?
2231   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2232     return true;
2233   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2234     return true;
2235 
2236   MachineIRBuilder HelperBuilder(MI);
2237   GISelObserverWrapper DummyObserver;
2238   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2239   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2240 }
2241 
2242 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2243   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2244   Register DstReg = MI.getOperand(0).getReg();
2245   Register PtrReg = MI.getOperand(1).getReg();
2246   Register CmpVal = MI.getOperand(2).getReg();
2247   Register NewVal = MI.getOperand(3).getReg();
2248 
2249   assert(SITargetLowering::isFlatGlobalAddrSpace(
2250            MRI.getType(PtrReg).getAddressSpace()) &&
2251          "this should not have been custom lowered");
2252 
2253   LLT ValTy = MRI.getType(CmpVal);
2254   LLT VecTy = LLT::vector(2, ValTy);
2255 
2256   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2257 
2258   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2259     .addDef(DstReg)
2260     .addUse(PtrReg)
2261     .addUse(PackedVal)
2262     .setMemRefs(MI.memoperands());
2263 
2264   MI.eraseFromParent();
2265   return true;
2266 }
2267 
2268 bool AMDGPULegalizerInfo::legalizeFlog(
2269   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2270   Register Dst = MI.getOperand(0).getReg();
2271   Register Src = MI.getOperand(1).getReg();
2272   LLT Ty = B.getMRI()->getType(Dst);
2273   unsigned Flags = MI.getFlags();
2274 
2275   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2276   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2277 
2278   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2279   MI.eraseFromParent();
2280   return true;
2281 }
2282 
2283 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2284                                        MachineIRBuilder &B) const {
2285   Register Dst = MI.getOperand(0).getReg();
2286   Register Src = MI.getOperand(1).getReg();
2287   unsigned Flags = MI.getFlags();
2288   LLT Ty = B.getMRI()->getType(Dst);
2289 
2290   auto K = B.buildFConstant(Ty, numbers::log2e);
2291   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2292   B.buildFExp2(Dst, Mul, Flags);
2293   MI.eraseFromParent();
2294   return true;
2295 }
2296 
2297 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2298                                        MachineIRBuilder &B) const {
2299   Register Dst = MI.getOperand(0).getReg();
2300   Register Src0 = MI.getOperand(1).getReg();
2301   Register Src1 = MI.getOperand(2).getReg();
2302   unsigned Flags = MI.getFlags();
2303   LLT Ty = B.getMRI()->getType(Dst);
2304   const LLT S16 = LLT::scalar(16);
2305   const LLT S32 = LLT::scalar(32);
2306 
2307   if (Ty == S32) {
2308     auto Log = B.buildFLog2(S32, Src0, Flags);
2309     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2310       .addUse(Log.getReg(0))
2311       .addUse(Src1)
2312       .setMIFlags(Flags);
2313     B.buildFExp2(Dst, Mul, Flags);
2314   } else if (Ty == S16) {
2315     // There's no f16 fmul_legacy, so we need to convert for it.
2316     auto Log = B.buildFLog2(S16, Src0, Flags);
2317     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2318     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2319     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2320       .addUse(Ext0.getReg(0))
2321       .addUse(Ext1.getReg(0))
2322       .setMIFlags(Flags);
2323 
2324     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2325   } else
2326     return false;
2327 
2328   MI.eraseFromParent();
2329   return true;
2330 }
2331 
2332 // Find a source register, ignoring any possible source modifiers.
2333 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2334   Register ModSrc = OrigSrc;
2335   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2336     ModSrc = SrcFNeg->getOperand(1).getReg();
2337     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2338       ModSrc = SrcFAbs->getOperand(1).getReg();
2339   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2340     ModSrc = SrcFAbs->getOperand(1).getReg();
2341   return ModSrc;
2342 }
2343 
2344 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2345                                          MachineRegisterInfo &MRI,
2346                                          MachineIRBuilder &B) const {
2347 
2348   const LLT S1 = LLT::scalar(1);
2349   const LLT S64 = LLT::scalar(64);
2350   Register Dst = MI.getOperand(0).getReg();
2351   Register OrigSrc = MI.getOperand(1).getReg();
2352   unsigned Flags = MI.getFlags();
2353   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2354          "this should not have been custom lowered");
2355 
2356   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2357   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2358   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2359   // V_FRACT bug is:
2360   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2361   //
2362   // Convert floor(x) to (x - fract(x))
2363 
2364   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2365     .addUse(OrigSrc)
2366     .setMIFlags(Flags);
2367 
2368   // Give source modifier matching some assistance before obscuring a foldable
2369   // pattern.
2370 
2371   // TODO: We can avoid the neg on the fract? The input sign to fract
2372   // shouldn't matter?
2373   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2374 
2375   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2376 
2377   Register Min = MRI.createGenericVirtualRegister(S64);
2378 
2379   // We don't need to concern ourselves with the snan handling difference, so
2380   // use the one which will directly select.
2381   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2382   if (MFI->getMode().IEEE)
2383     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2384   else
2385     B.buildFMinNum(Min, Fract, Const, Flags);
2386 
2387   Register CorrectedFract = Min;
2388   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2389     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2390     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2391   }
2392 
2393   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2394   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2395 
2396   MI.eraseFromParent();
2397   return true;
2398 }
2399 
2400 // Turn an illegal packed v2s16 build vector into bit operations.
2401 // TODO: This should probably be a bitcast action in LegalizerHelper.
2402 bool AMDGPULegalizerInfo::legalizeBuildVector(
2403   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2404   Register Dst = MI.getOperand(0).getReg();
2405   const LLT S32 = LLT::scalar(32);
2406   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2407 
2408   Register Src0 = MI.getOperand(1).getReg();
2409   Register Src1 = MI.getOperand(2).getReg();
2410   assert(MRI.getType(Src0) == LLT::scalar(16));
2411 
2412   auto Merge = B.buildMerge(S32, {Src0, Src1});
2413   B.buildBitcast(Dst, Merge);
2414 
2415   MI.eraseFromParent();
2416   return true;
2417 }
2418 
2419 // Return the use branch instruction, otherwise null if the usage is invalid.
2420 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2421                                        MachineRegisterInfo &MRI,
2422                                        MachineInstr *&Br,
2423                                        MachineBasicBlock *&UncondBrTarget) {
2424   Register CondDef = MI.getOperand(0).getReg();
2425   if (!MRI.hasOneNonDBGUse(CondDef))
2426     return nullptr;
2427 
2428   MachineBasicBlock *Parent = MI.getParent();
2429   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2430   if (UseMI.getParent() != Parent ||
2431       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2432     return nullptr;
2433 
2434   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2435   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2436   if (Next == Parent->end()) {
2437     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2438     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2439       return nullptr;
2440     UncondBrTarget = &*NextMBB;
2441   } else {
2442     if (Next->getOpcode() != AMDGPU::G_BR)
2443       return nullptr;
2444     Br = &*Next;
2445     UncondBrTarget = Br->getOperand(0).getMBB();
2446   }
2447 
2448   return &UseMI;
2449 }
2450 
2451 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2452                                                MachineRegisterInfo &MRI,
2453                                                Register LiveIn,
2454                                                Register PhyReg) const {
2455   assert(PhyReg.isPhysical() && "Physical register expected");
2456 
2457   // Insert the live-in copy, if required, by defining destination virtual
2458   // register.
2459   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2460   if (!MRI.getVRegDef(LiveIn)) {
2461     // FIXME: Should have scoped insert pt
2462     MachineBasicBlock &OrigInsBB = B.getMBB();
2463     auto OrigInsPt = B.getInsertPt();
2464 
2465     MachineBasicBlock &EntryMBB = B.getMF().front();
2466     EntryMBB.addLiveIn(PhyReg);
2467     B.setInsertPt(EntryMBB, EntryMBB.begin());
2468     B.buildCopy(LiveIn, PhyReg);
2469 
2470     B.setInsertPt(OrigInsBB, OrigInsPt);
2471   }
2472 
2473   return LiveIn;
2474 }
2475 
2476 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2477                                                 MachineRegisterInfo &MRI,
2478                                                 Register PhyReg, LLT Ty,
2479                                                 bool InsertLiveInCopy) const {
2480   assert(PhyReg.isPhysical() && "Physical register expected");
2481 
2482   // Get or create virtual live-in regester
2483   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2484   if (!LiveIn) {
2485     LiveIn = MRI.createGenericVirtualRegister(Ty);
2486     MRI.addLiveIn(PhyReg, LiveIn);
2487   }
2488 
2489   // When the actual true copy required is from virtual register to physical
2490   // register (to be inserted later), live-in copy insertion from physical
2491   // to register virtual register is not required
2492   if (!InsertLiveInCopy)
2493     return LiveIn;
2494 
2495   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2496 }
2497 
2498 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2499                                          const ArgDescriptor *Arg,
2500                                          const TargetRegisterClass *ArgRC,
2501                                          LLT ArgTy) const {
2502   MCRegister SrcReg = Arg->getRegister();
2503   assert(SrcReg.isPhysical() && "Physical register expected");
2504   assert(DstReg.isVirtual() && "Virtual register expected");
2505 
2506   MachineRegisterInfo &MRI = *B.getMRI();
2507   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, ArgTy);
2508 
2509   if (Arg->isMasked()) {
2510     // TODO: Should we try to emit this once in the entry block?
2511     const LLT S32 = LLT::scalar(32);
2512     const unsigned Mask = Arg->getMask();
2513     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2514 
2515     Register AndMaskSrc = LiveIn;
2516 
2517     if (Shift != 0) {
2518       auto ShiftAmt = B.buildConstant(S32, Shift);
2519       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2520     }
2521 
2522     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2523   } else {
2524     B.buildCopy(DstReg, LiveIn);
2525   }
2526 
2527   return true;
2528 }
2529 
2530 bool AMDGPULegalizerInfo::loadInputValue(
2531     Register DstReg, MachineIRBuilder &B,
2532     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2533   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2534   const ArgDescriptor *Arg;
2535   const TargetRegisterClass *ArgRC;
2536   LLT ArgTy;
2537   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2538 
2539   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2540     return false; // TODO: Handle these
2541   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2542 }
2543 
2544 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2545     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2546     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2547   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2548     return false;
2549 
2550   MI.eraseFromParent();
2551   return true;
2552 }
2553 
2554 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2555                                        MachineRegisterInfo &MRI,
2556                                        MachineIRBuilder &B) const {
2557   Register Dst = MI.getOperand(0).getReg();
2558   LLT DstTy = MRI.getType(Dst);
2559   LLT S16 = LLT::scalar(16);
2560   LLT S32 = LLT::scalar(32);
2561   LLT S64 = LLT::scalar(64);
2562 
2563   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2564     return true;
2565 
2566   if (DstTy == S16)
2567     return legalizeFDIV16(MI, MRI, B);
2568   if (DstTy == S32)
2569     return legalizeFDIV32(MI, MRI, B);
2570   if (DstTy == S64)
2571     return legalizeFDIV64(MI, MRI, B);
2572 
2573   return false;
2574 }
2575 
2576 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2577                                                   Register DstReg,
2578                                                   Register X,
2579                                                   Register Y,
2580                                                   bool IsDiv) const {
2581   const LLT S1 = LLT::scalar(1);
2582   const LLT S32 = LLT::scalar(32);
2583 
2584   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2585   // algorithm used here.
2586 
2587   // Initial estimate of inv(y).
2588   auto FloatY = B.buildUITOFP(S32, Y);
2589   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2590   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2591   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2592   auto Z = B.buildFPTOUI(S32, ScaledY);
2593 
2594   // One round of UNR.
2595   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2596   auto NegYZ = B.buildMul(S32, NegY, Z);
2597   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2598 
2599   // Quotient/remainder estimate.
2600   auto Q = B.buildUMulH(S32, X, Z);
2601   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2602 
2603   // First quotient/remainder refinement.
2604   auto One = B.buildConstant(S32, 1);
2605   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2606   if (IsDiv)
2607     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2608   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2609 
2610   // Second quotient/remainder refinement.
2611   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2612   if (IsDiv)
2613     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2614   else
2615     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2616 }
2617 
2618 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2619                                               MachineRegisterInfo &MRI,
2620                                               MachineIRBuilder &B) const {
2621   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2622   Register DstReg = MI.getOperand(0).getReg();
2623   Register Num = MI.getOperand(1).getReg();
2624   Register Den = MI.getOperand(2).getReg();
2625   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2626   MI.eraseFromParent();
2627   return true;
2628 }
2629 
2630 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2631 //
2632 // Return lo, hi of result
2633 //
2634 // %cvt.lo = G_UITOFP Val.lo
2635 // %cvt.hi = G_UITOFP Val.hi
2636 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2637 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2638 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2639 // %mul2 = G_FMUL %mul1, 2**(-32)
2640 // %trunc = G_INTRINSIC_TRUNC %mul2
2641 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2642 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2643 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2644                                                        Register Val) {
2645   const LLT S32 = LLT::scalar(32);
2646   auto Unmerge = B.buildUnmerge(S32, Val);
2647 
2648   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2649   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2650 
2651   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2652                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2653 
2654   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2655   auto Mul1 =
2656       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2657 
2658   // 2**(-32)
2659   auto Mul2 =
2660       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2661   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2662 
2663   // -(2**32)
2664   auto Mad2 = B.buildFMAD(S32, Trunc,
2665                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2666 
2667   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2668   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2669 
2670   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2671 }
2672 
2673 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2674                                                   Register DstReg,
2675                                                   Register Numer,
2676                                                   Register Denom,
2677                                                   bool IsDiv) const {
2678   const LLT S32 = LLT::scalar(32);
2679   const LLT S64 = LLT::scalar(64);
2680   const LLT S1 = LLT::scalar(1);
2681   Register RcpLo, RcpHi;
2682 
2683   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2684 
2685   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2686 
2687   auto Zero64 = B.buildConstant(S64, 0);
2688   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2689 
2690   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2691   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2692 
2693   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2694   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2695   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2696 
2697   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2698   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2699   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2700   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2701 
2702   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2703   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2704   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2705   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2706   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2707 
2708   auto Zero32 = B.buildConstant(S32, 0);
2709   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2710   auto Add2_HiC =
2711       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2712   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2713   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2714 
2715   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2716   Register NumerLo = UnmergeNumer.getReg(0);
2717   Register NumerHi = UnmergeNumer.getReg(1);
2718 
2719   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2720   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2721   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2722   Register Mul3_Lo = UnmergeMul3.getReg(0);
2723   Register Mul3_Hi = UnmergeMul3.getReg(1);
2724   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2725   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2726   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2727   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2728 
2729   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2730   Register DenomLo = UnmergeDenom.getReg(0);
2731   Register DenomHi = UnmergeDenom.getReg(1);
2732 
2733   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2734   auto C1 = B.buildSExt(S32, CmpHi);
2735 
2736   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2737   auto C2 = B.buildSExt(S32, CmpLo);
2738 
2739   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2740   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2741 
2742   // TODO: Here and below portions of the code can be enclosed into if/endif.
2743   // Currently control flow is unconditional and we have 4 selects after
2744   // potential endif to substitute PHIs.
2745 
2746   // if C3 != 0 ...
2747   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2748   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2749   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2750   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2751 
2752   auto One64 = B.buildConstant(S64, 1);
2753   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2754 
2755   auto C4 =
2756       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2757   auto C5 =
2758       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2759   auto C6 = B.buildSelect(
2760       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2761 
2762   // if (C6 != 0)
2763   auto Add4 = B.buildAdd(S64, Add3, One64);
2764   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2765 
2766   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2767   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2768   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2769 
2770   // endif C6
2771   // endif C3
2772 
2773   if (IsDiv) {
2774     auto Sel1 = B.buildSelect(
2775         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2776     B.buildSelect(DstReg,
2777                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2778   } else {
2779     auto Sel2 = B.buildSelect(
2780         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2781     B.buildSelect(DstReg,
2782                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2783   }
2784 }
2785 
2786 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2787                                             MachineRegisterInfo &MRI,
2788                                             MachineIRBuilder &B) const {
2789   const LLT S64 = LLT::scalar(64);
2790   const LLT S32 = LLT::scalar(32);
2791   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2792   Register DstReg = MI.getOperand(0).getReg();
2793   Register Num = MI.getOperand(1).getReg();
2794   Register Den = MI.getOperand(2).getReg();
2795   LLT Ty = MRI.getType(DstReg);
2796 
2797   if (Ty == S32)
2798     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2799   else if (Ty == S64)
2800     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2801   else
2802     return false;
2803 
2804   MI.eraseFromParent();
2805   return true;
2806 
2807 }
2808 
2809 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2810                                             MachineRegisterInfo &MRI,
2811                                             MachineIRBuilder &B) const {
2812   const LLT S64 = LLT::scalar(64);
2813   const LLT S32 = LLT::scalar(32);
2814 
2815   Register DstReg = MI.getOperand(0).getReg();
2816   const LLT Ty = MRI.getType(DstReg);
2817   if (Ty != S32 && Ty != S64)
2818     return false;
2819 
2820   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2821 
2822   Register LHS = MI.getOperand(1).getReg();
2823   Register RHS = MI.getOperand(2).getReg();
2824 
2825   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2826   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2827   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2828 
2829   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2830   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2831 
2832   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2833   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2834 
2835   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2836   if (Ty == S32)
2837     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2838   else
2839     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2840 
2841   Register Sign;
2842   if (IsDiv)
2843     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2844   else
2845     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2846 
2847   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2848   B.buildSub(DstReg, UDivRem, Sign);
2849 
2850   MI.eraseFromParent();
2851   return true;
2852 }
2853 
2854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2855                                                  MachineRegisterInfo &MRI,
2856                                                  MachineIRBuilder &B) const {
2857   Register Res = MI.getOperand(0).getReg();
2858   Register LHS = MI.getOperand(1).getReg();
2859   Register RHS = MI.getOperand(2).getReg();
2860 
2861   uint16_t Flags = MI.getFlags();
2862 
2863   LLT ResTy = MRI.getType(Res);
2864   LLT S32 = LLT::scalar(32);
2865   LLT S64 = LLT::scalar(64);
2866 
2867   const MachineFunction &MF = B.getMF();
2868   bool Unsafe =
2869     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2870 
2871   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2872     return false;
2873 
2874   if (!Unsafe && ResTy == S32 &&
2875       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2876     return false;
2877 
2878   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2879     // 1 / x -> RCP(x)
2880     if (CLHS->isExactlyValue(1.0)) {
2881       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2882         .addUse(RHS)
2883         .setMIFlags(Flags);
2884 
2885       MI.eraseFromParent();
2886       return true;
2887     }
2888 
2889     // -1 / x -> RCP( FNEG(x) )
2890     if (CLHS->isExactlyValue(-1.0)) {
2891       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2892       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2893         .addUse(FNeg.getReg(0))
2894         .setMIFlags(Flags);
2895 
2896       MI.eraseFromParent();
2897       return true;
2898     }
2899   }
2900 
2901   // x / y -> x * (1.0 / y)
2902   if (Unsafe) {
2903     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2904       .addUse(RHS)
2905       .setMIFlags(Flags);
2906     B.buildFMul(Res, LHS, RCP, Flags);
2907 
2908     MI.eraseFromParent();
2909     return true;
2910   }
2911 
2912   return false;
2913 }
2914 
2915 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2916                                          MachineRegisterInfo &MRI,
2917                                          MachineIRBuilder &B) const {
2918   Register Res = MI.getOperand(0).getReg();
2919   Register LHS = MI.getOperand(1).getReg();
2920   Register RHS = MI.getOperand(2).getReg();
2921 
2922   uint16_t Flags = MI.getFlags();
2923 
2924   LLT S16 = LLT::scalar(16);
2925   LLT S32 = LLT::scalar(32);
2926 
2927   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2928   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2929 
2930   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2931     .addUse(RHSExt.getReg(0))
2932     .setMIFlags(Flags);
2933 
2934   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2935   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2936 
2937   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2938     .addUse(RDst.getReg(0))
2939     .addUse(RHS)
2940     .addUse(LHS)
2941     .setMIFlags(Flags);
2942 
2943   MI.eraseFromParent();
2944   return true;
2945 }
2946 
2947 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2948 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2949 static void toggleSPDenormMode(bool Enable,
2950                                MachineIRBuilder &B,
2951                                const GCNSubtarget &ST,
2952                                AMDGPU::SIModeRegisterDefaults Mode) {
2953   // Set SP denorm mode to this value.
2954   unsigned SPDenormMode =
2955     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2956 
2957   if (ST.hasDenormModeInst()) {
2958     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2959     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2960 
2961     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2962     B.buildInstr(AMDGPU::S_DENORM_MODE)
2963       .addImm(NewDenormModeValue);
2964 
2965   } else {
2966     // Select FP32 bit field in mode register.
2967     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2968                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2969                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2970 
2971     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2972       .addImm(SPDenormMode)
2973       .addImm(SPDenormModeBitField);
2974   }
2975 }
2976 
2977 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2978                                          MachineRegisterInfo &MRI,
2979                                          MachineIRBuilder &B) const {
2980   Register Res = MI.getOperand(0).getReg();
2981   Register LHS = MI.getOperand(1).getReg();
2982   Register RHS = MI.getOperand(2).getReg();
2983   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2984   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2985 
2986   uint16_t Flags = MI.getFlags();
2987 
2988   LLT S32 = LLT::scalar(32);
2989   LLT S1 = LLT::scalar(1);
2990 
2991   auto One = B.buildFConstant(S32, 1.0f);
2992 
2993   auto DenominatorScaled =
2994     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2995       .addUse(LHS)
2996       .addUse(RHS)
2997       .addImm(0)
2998       .setMIFlags(Flags);
2999   auto NumeratorScaled =
3000     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3001       .addUse(LHS)
3002       .addUse(RHS)
3003       .addImm(1)
3004       .setMIFlags(Flags);
3005 
3006   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3007     .addUse(DenominatorScaled.getReg(0))
3008     .setMIFlags(Flags);
3009   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3010 
3011   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3012   // aren't modeled as reading it.
3013   if (!Mode.allFP32Denormals())
3014     toggleSPDenormMode(true, B, ST, Mode);
3015 
3016   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3017   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3018   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3019   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3020   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3021   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3022 
3023   if (!Mode.allFP32Denormals())
3024     toggleSPDenormMode(false, B, ST, Mode);
3025 
3026   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3027     .addUse(Fma4.getReg(0))
3028     .addUse(Fma1.getReg(0))
3029     .addUse(Fma3.getReg(0))
3030     .addUse(NumeratorScaled.getReg(1))
3031     .setMIFlags(Flags);
3032 
3033   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3034     .addUse(Fmas.getReg(0))
3035     .addUse(RHS)
3036     .addUse(LHS)
3037     .setMIFlags(Flags);
3038 
3039   MI.eraseFromParent();
3040   return true;
3041 }
3042 
3043 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3044                                          MachineRegisterInfo &MRI,
3045                                          MachineIRBuilder &B) const {
3046   Register Res = MI.getOperand(0).getReg();
3047   Register LHS = MI.getOperand(1).getReg();
3048   Register RHS = MI.getOperand(2).getReg();
3049 
3050   uint16_t Flags = MI.getFlags();
3051 
3052   LLT S64 = LLT::scalar(64);
3053   LLT S1 = LLT::scalar(1);
3054 
3055   auto One = B.buildFConstant(S64, 1.0);
3056 
3057   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3058     .addUse(LHS)
3059     .addUse(RHS)
3060     .addImm(0)
3061     .setMIFlags(Flags);
3062 
3063   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3064 
3065   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3066     .addUse(DivScale0.getReg(0))
3067     .setMIFlags(Flags);
3068 
3069   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3070   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3071   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3072 
3073   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3074     .addUse(LHS)
3075     .addUse(RHS)
3076     .addImm(1)
3077     .setMIFlags(Flags);
3078 
3079   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3080   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3081   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3082 
3083   Register Scale;
3084   if (!ST.hasUsableDivScaleConditionOutput()) {
3085     // Workaround a hardware bug on SI where the condition output from div_scale
3086     // is not usable.
3087 
3088     LLT S32 = LLT::scalar(32);
3089 
3090     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3091     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3092     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3093     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3094 
3095     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3096                               Scale1Unmerge.getReg(1));
3097     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3098                               Scale0Unmerge.getReg(1));
3099     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3100   } else {
3101     Scale = DivScale1.getReg(1);
3102   }
3103 
3104   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3105     .addUse(Fma4.getReg(0))
3106     .addUse(Fma3.getReg(0))
3107     .addUse(Mul.getReg(0))
3108     .addUse(Scale)
3109     .setMIFlags(Flags);
3110 
3111   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3112     .addUse(Fmas.getReg(0))
3113     .addUse(RHS)
3114     .addUse(LHS)
3115     .setMIFlags(Flags);
3116 
3117   MI.eraseFromParent();
3118   return true;
3119 }
3120 
3121 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3122                                                  MachineRegisterInfo &MRI,
3123                                                  MachineIRBuilder &B) const {
3124   Register Res = MI.getOperand(0).getReg();
3125   Register LHS = MI.getOperand(2).getReg();
3126   Register RHS = MI.getOperand(3).getReg();
3127   uint16_t Flags = MI.getFlags();
3128 
3129   LLT S32 = LLT::scalar(32);
3130   LLT S1 = LLT::scalar(1);
3131 
3132   auto Abs = B.buildFAbs(S32, RHS, Flags);
3133   const APFloat C0Val(1.0f);
3134 
3135   auto C0 = B.buildConstant(S32, 0x6f800000);
3136   auto C1 = B.buildConstant(S32, 0x2f800000);
3137   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3138 
3139   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3140   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3141 
3142   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3143 
3144   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3145     .addUse(Mul0.getReg(0))
3146     .setMIFlags(Flags);
3147 
3148   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3149 
3150   B.buildFMul(Res, Sel, Mul1, Flags);
3151 
3152   MI.eraseFromParent();
3153   return true;
3154 }
3155 
3156 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3157                                             MachineRegisterInfo &MRI,
3158                                             MachineIRBuilder &B) const {
3159   uint64_t Offset =
3160     ST.getTargetLowering()->getImplicitParameterOffset(
3161       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3162   LLT DstTy = MRI.getType(DstReg);
3163   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3164 
3165   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3166   if (!loadInputValue(KernargPtrReg, B,
3167                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3168     return false;
3169 
3170   // FIXME: This should be nuw
3171   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3172   return true;
3173 }
3174 
3175 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3176                                                  MachineRegisterInfo &MRI,
3177                                                  MachineIRBuilder &B) const {
3178   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3179   if (!MFI->isEntryFunction()) {
3180     return legalizePreloadedArgIntrin(MI, MRI, B,
3181                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3182   }
3183 
3184   Register DstReg = MI.getOperand(0).getReg();
3185   if (!getImplicitArgPtr(DstReg, MRI, B))
3186     return false;
3187 
3188   MI.eraseFromParent();
3189   return true;
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3193                                               MachineRegisterInfo &MRI,
3194                                               MachineIRBuilder &B,
3195                                               unsigned AddrSpace) const {
3196   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3197   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3198   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3199   MI.eraseFromParent();
3200   return true;
3201 }
3202 
3203 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3204 // offset (the offset that is included in bounds checking and swizzling, to be
3205 // split between the instruction's voffset and immoffset fields) and soffset
3206 // (the offset that is excluded from bounds checking and swizzling, to go in
3207 // the instruction's soffset field).  This function takes the first kind of
3208 // offset and figures out how to split it between voffset and immoffset.
3209 std::tuple<Register, unsigned, unsigned>
3210 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3211                                         Register OrigOffset) const {
3212   const unsigned MaxImm = 4095;
3213   Register BaseReg;
3214   unsigned TotalConstOffset;
3215   MachineInstr *OffsetDef;
3216   const LLT S32 = LLT::scalar(32);
3217 
3218   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3219     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3220 
3221   unsigned ImmOffset = TotalConstOffset;
3222 
3223   // If the immediate value is too big for the immoffset field, put the value
3224   // and -4096 into the immoffset field so that the value that is copied/added
3225   // for the voffset field is a multiple of 4096, and it stands more chance
3226   // of being CSEd with the copy/add for another similar load/store.
3227   // However, do not do that rounding down to a multiple of 4096 if that is a
3228   // negative number, as it appears to be illegal to have a negative offset
3229   // in the vgpr, even if adding the immediate offset makes it positive.
3230   unsigned Overflow = ImmOffset & ~MaxImm;
3231   ImmOffset -= Overflow;
3232   if ((int32_t)Overflow < 0) {
3233     Overflow += ImmOffset;
3234     ImmOffset = 0;
3235   }
3236 
3237   if (Overflow != 0) {
3238     if (!BaseReg) {
3239       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3240     } else {
3241       auto OverflowVal = B.buildConstant(S32, Overflow);
3242       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3243     }
3244   }
3245 
3246   if (!BaseReg)
3247     BaseReg = B.buildConstant(S32, 0).getReg(0);
3248 
3249   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3250 }
3251 
3252 /// Handle register layout difference for f16 images for some subtargets.
3253 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3254                                              MachineRegisterInfo &MRI,
3255                                              Register Reg) const {
3256   if (!ST.hasUnpackedD16VMem())
3257     return Reg;
3258 
3259   const LLT S16 = LLT::scalar(16);
3260   const LLT S32 = LLT::scalar(32);
3261   LLT StoreVT = MRI.getType(Reg);
3262   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3263 
3264   auto Unmerge = B.buildUnmerge(S16, Reg);
3265 
3266   SmallVector<Register, 4> WideRegs;
3267   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3268     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3269 
3270   int NumElts = StoreVT.getNumElements();
3271 
3272   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3273 }
3274 
3275 Register AMDGPULegalizerInfo::fixStoreSourceType(
3276   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3277   MachineRegisterInfo *MRI = B.getMRI();
3278   LLT Ty = MRI->getType(VData);
3279 
3280   const LLT S16 = LLT::scalar(16);
3281 
3282   // Fixup illegal register types for i8 stores.
3283   if (Ty == LLT::scalar(8) || Ty == S16) {
3284     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3285     return AnyExt;
3286   }
3287 
3288   if (Ty.isVector()) {
3289     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3290       if (IsFormat)
3291         return handleD16VData(B, *MRI, VData);
3292     }
3293   }
3294 
3295   return VData;
3296 }
3297 
3298 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3299                                               MachineRegisterInfo &MRI,
3300                                               MachineIRBuilder &B,
3301                                               bool IsTyped,
3302                                               bool IsFormat) const {
3303   Register VData = MI.getOperand(1).getReg();
3304   LLT Ty = MRI.getType(VData);
3305   LLT EltTy = Ty.getScalarType();
3306   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3307   const LLT S32 = LLT::scalar(32);
3308 
3309   VData = fixStoreSourceType(B, VData, IsFormat);
3310   Register RSrc = MI.getOperand(2).getReg();
3311 
3312   MachineMemOperand *MMO = *MI.memoperands_begin();
3313   const int MemSize = MMO->getSize();
3314 
3315   unsigned ImmOffset;
3316   unsigned TotalOffset;
3317 
3318   // The typed intrinsics add an immediate after the registers.
3319   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3320 
3321   // The struct intrinsic variants add one additional operand over raw.
3322   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3323   Register VIndex;
3324   int OpOffset = 0;
3325   if (HasVIndex) {
3326     VIndex = MI.getOperand(3).getReg();
3327     OpOffset = 1;
3328   }
3329 
3330   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3331   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3332 
3333   unsigned Format = 0;
3334   if (IsTyped) {
3335     Format = MI.getOperand(5 + OpOffset).getImm();
3336     ++OpOffset;
3337   }
3338 
3339   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3340 
3341   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3342   if (TotalOffset != 0)
3343     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3344 
3345   unsigned Opc;
3346   if (IsTyped) {
3347     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3348                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3349   } else if (IsFormat) {
3350     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3351                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3352   } else {
3353     switch (MemSize) {
3354     case 1:
3355       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3356       break;
3357     case 2:
3358       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3359       break;
3360     default:
3361       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3362       break;
3363     }
3364   }
3365 
3366   if (!VIndex)
3367     VIndex = B.buildConstant(S32, 0).getReg(0);
3368 
3369   auto MIB = B.buildInstr(Opc)
3370     .addUse(VData)              // vdata
3371     .addUse(RSrc)               // rsrc
3372     .addUse(VIndex)             // vindex
3373     .addUse(VOffset)            // voffset
3374     .addUse(SOffset)            // soffset
3375     .addImm(ImmOffset);         // offset(imm)
3376 
3377   if (IsTyped)
3378     MIB.addImm(Format);
3379 
3380   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3381      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3382      .addMemOperand(MMO);
3383 
3384   MI.eraseFromParent();
3385   return true;
3386 }
3387 
3388 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3389                                              MachineRegisterInfo &MRI,
3390                                              MachineIRBuilder &B,
3391                                              bool IsFormat,
3392                                              bool IsTyped) const {
3393   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3394   MachineMemOperand *MMO = *MI.memoperands_begin();
3395   const int MemSize = MMO->getSize();
3396   const LLT S32 = LLT::scalar(32);
3397 
3398   Register Dst = MI.getOperand(0).getReg();
3399   Register RSrc = MI.getOperand(2).getReg();
3400 
3401   // The typed intrinsics add an immediate after the registers.
3402   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3403 
3404   // The struct intrinsic variants add one additional operand over raw.
3405   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3406   Register VIndex;
3407   int OpOffset = 0;
3408   if (HasVIndex) {
3409     VIndex = MI.getOperand(3).getReg();
3410     OpOffset = 1;
3411   }
3412 
3413   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3414   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3415 
3416   unsigned Format = 0;
3417   if (IsTyped) {
3418     Format = MI.getOperand(5 + OpOffset).getImm();
3419     ++OpOffset;
3420   }
3421 
3422   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3423   unsigned ImmOffset;
3424   unsigned TotalOffset;
3425 
3426   LLT Ty = MRI.getType(Dst);
3427   LLT EltTy = Ty.getScalarType();
3428   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3429   const bool Unpacked = ST.hasUnpackedD16VMem();
3430 
3431   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3432   if (TotalOffset != 0)
3433     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3434 
3435   unsigned Opc;
3436 
3437   if (IsTyped) {
3438     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3439                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3440   } else if (IsFormat) {
3441     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3442                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3443   } else {
3444     switch (MemSize) {
3445     case 1:
3446       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3447       break;
3448     case 2:
3449       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3450       break;
3451     default:
3452       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3453       break;
3454     }
3455   }
3456 
3457   Register LoadDstReg;
3458 
3459   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3460   LLT UnpackedTy = Ty.changeElementSize(32);
3461 
3462   if (IsExtLoad)
3463     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3464   else if (Unpacked && IsD16 && Ty.isVector())
3465     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3466   else
3467     LoadDstReg = Dst;
3468 
3469   if (!VIndex)
3470     VIndex = B.buildConstant(S32, 0).getReg(0);
3471 
3472   auto MIB = B.buildInstr(Opc)
3473     .addDef(LoadDstReg)         // vdata
3474     .addUse(RSrc)               // rsrc
3475     .addUse(VIndex)             // vindex
3476     .addUse(VOffset)            // voffset
3477     .addUse(SOffset)            // soffset
3478     .addImm(ImmOffset);         // offset(imm)
3479 
3480   if (IsTyped)
3481     MIB.addImm(Format);
3482 
3483   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3484      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3485      .addMemOperand(MMO);
3486 
3487   if (LoadDstReg != Dst) {
3488     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3489 
3490     // Widen result for extending loads was widened.
3491     if (IsExtLoad)
3492       B.buildTrunc(Dst, LoadDstReg);
3493     else {
3494       // Repack to original 16-bit vector result
3495       // FIXME: G_TRUNC should work, but legalization currently fails
3496       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3497       SmallVector<Register, 4> Repack;
3498       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3499         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3500       B.buildMerge(Dst, Repack);
3501     }
3502   }
3503 
3504   MI.eraseFromParent();
3505   return true;
3506 }
3507 
3508 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3509                                                MachineIRBuilder &B,
3510                                                bool IsInc) const {
3511   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3512                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3513   B.buildInstr(Opc)
3514     .addDef(MI.getOperand(0).getReg())
3515     .addUse(MI.getOperand(2).getReg())
3516     .addUse(MI.getOperand(3).getReg())
3517     .cloneMemRefs(MI);
3518   MI.eraseFromParent();
3519   return true;
3520 }
3521 
3522 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3523   switch (IntrID) {
3524   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3525   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3526     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3527   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3528   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3529     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3530   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3531   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3532     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3533   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3534   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3535     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3536   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3537   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3538     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3539   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3540   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3541     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3542   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3543   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3544     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3545   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3546   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3547     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3548   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3549   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3550     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3551   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3552   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3553     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3554   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3555   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3556     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3557   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3558   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3559     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3560   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3561   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3562     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3563   default:
3564     llvm_unreachable("unhandled atomic opcode");
3565   }
3566 }
3567 
3568 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3569                                                MachineIRBuilder &B,
3570                                                Intrinsic::ID IID) const {
3571   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3572                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3573 
3574   Register Dst = MI.getOperand(0).getReg();
3575   Register VData = MI.getOperand(2).getReg();
3576 
3577   Register CmpVal;
3578   int OpOffset = 0;
3579 
3580   if (IsCmpSwap) {
3581     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3582     ++OpOffset;
3583   }
3584 
3585   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3586   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3587 
3588   // The struct intrinsic variants add one additional operand over raw.
3589   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3590   Register VIndex;
3591   if (HasVIndex) {
3592     VIndex = MI.getOperand(4 + OpOffset).getReg();
3593     ++OpOffset;
3594   }
3595 
3596   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3597   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3598   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3599 
3600   MachineMemOperand *MMO = *MI.memoperands_begin();
3601 
3602   unsigned ImmOffset;
3603   unsigned TotalOffset;
3604   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3605   if (TotalOffset != 0)
3606     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3607 
3608   if (!VIndex)
3609     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3610 
3611   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3612     .addDef(Dst)
3613     .addUse(VData); // vdata
3614 
3615   if (IsCmpSwap)
3616     MIB.addReg(CmpVal);
3617 
3618   MIB.addUse(RSrc)               // rsrc
3619      .addUse(VIndex)             // vindex
3620      .addUse(VOffset)            // voffset
3621      .addUse(SOffset)            // soffset
3622      .addImm(ImmOffset)          // offset(imm)
3623      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3624      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3625      .addMemOperand(MMO);
3626 
3627   MI.eraseFromParent();
3628   return true;
3629 }
3630 
3631 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3632 /// vector with s16 typed elements.
3633 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3634                                         SmallVectorImpl<Register> &PackedAddrs,
3635                                         int AddrIdx, int DimIdx, int EndIdx,
3636                                         int NumGradients) {
3637   const LLT S16 = LLT::scalar(16);
3638   const LLT V2S16 = LLT::vector(2, 16);
3639 
3640   for (int I = AddrIdx; I < EndIdx; ++I) {
3641     MachineOperand &SrcOp = MI.getOperand(I);
3642     if (!SrcOp.isReg())
3643       continue; // _L to _LZ may have eliminated this.
3644 
3645     Register AddrReg = SrcOp.getReg();
3646 
3647     if (I < DimIdx) {
3648       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3649       PackedAddrs.push_back(AddrReg);
3650     } else {
3651       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3652       // derivatives dx/dh and dx/dv are packed with undef.
3653       if (((I + 1) >= EndIdx) ||
3654           ((NumGradients / 2) % 2 == 1 &&
3655            (I == DimIdx + (NumGradients / 2) - 1 ||
3656             I == DimIdx + NumGradients - 1)) ||
3657           // Check for _L to _LZ optimization
3658           !MI.getOperand(I + 1).isReg()) {
3659         PackedAddrs.push_back(
3660             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3661                 .getReg(0));
3662       } else {
3663         PackedAddrs.push_back(
3664             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3665                 .getReg(0));
3666         ++I;
3667       }
3668     }
3669   }
3670 }
3671 
3672 /// Convert from separate vaddr components to a single vector address register,
3673 /// and replace the remaining operands with $noreg.
3674 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3675                                      int DimIdx, int NumVAddrs) {
3676   const LLT S32 = LLT::scalar(32);
3677 
3678   SmallVector<Register, 8> AddrRegs;
3679   for (int I = 0; I != NumVAddrs; ++I) {
3680     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3681     if (SrcOp.isReg()) {
3682       AddrRegs.push_back(SrcOp.getReg());
3683       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3684     }
3685   }
3686 
3687   int NumAddrRegs = AddrRegs.size();
3688   if (NumAddrRegs != 1) {
3689     // Round up to 8 elements for v5-v7
3690     // FIXME: Missing intermediate sized register classes and instructions.
3691     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3692       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3693       auto Undef = B.buildUndef(S32);
3694       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3695       NumAddrRegs = RoundedNumRegs;
3696     }
3697 
3698     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3699     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3700   }
3701 
3702   for (int I = 1; I != NumVAddrs; ++I) {
3703     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3704     if (SrcOp.isReg())
3705       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3706   }
3707 }
3708 
3709 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3710 ///
3711 /// Depending on the subtarget, load/store with 16-bit element data need to be
3712 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3713 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3714 /// registers.
3715 ///
3716 /// We don't want to directly select image instructions just yet, but also want
3717 /// to exposes all register repacking to the legalizer/combiners. We also don't
3718 /// want a selected instrution entering RegBankSelect. In order to avoid
3719 /// defining a multitude of intermediate image instructions, directly hack on
3720 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
3721 /// now unnecessary arguments with $noreg.
3722 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3723     MachineInstr &MI, MachineIRBuilder &B,
3724     GISelChangeObserver &Observer,
3725     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3726 
3727   const int NumDefs = MI.getNumExplicitDefs();
3728   bool IsTFE = NumDefs == 2;
3729   // We are only processing the operands of d16 image operations on subtargets
3730   // that use the unpacked register layout, or need to repack the TFE result.
3731 
3732   // TODO: Do we need to guard against already legalized intrinsics?
3733   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3734     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3735 
3736   MachineRegisterInfo *MRI = B.getMRI();
3737   const LLT S32 = LLT::scalar(32);
3738   const LLT S16 = LLT::scalar(16);
3739   const LLT V2S16 = LLT::vector(2, 16);
3740 
3741   // Index of first address argument
3742   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3743 
3744   int NumVAddrs, NumGradients;
3745   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3746   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3747     getDMaskIdx(BaseOpcode, NumDefs);
3748   unsigned DMask = 0;
3749 
3750   // Check for 16 bit addresses and pack if true.
3751   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3752   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3753   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3754   const bool IsG16 = GradTy == S16;
3755   const bool IsA16 = AddrTy == S16;
3756 
3757   int DMaskLanes = 0;
3758   if (!BaseOpcode->Atomic) {
3759     DMask = MI.getOperand(DMaskIdx).getImm();
3760     if (BaseOpcode->Gather4) {
3761       DMaskLanes = 4;
3762     } else if (DMask != 0) {
3763       DMaskLanes = countPopulation(DMask);
3764     } else if (!IsTFE && !BaseOpcode->Store) {
3765       // If dmask is 0, this is a no-op load. This can be eliminated.
3766       B.buildUndef(MI.getOperand(0));
3767       MI.eraseFromParent();
3768       return true;
3769     }
3770   }
3771 
3772   Observer.changingInstr(MI);
3773   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3774 
3775   unsigned NewOpcode = NumDefs == 0 ?
3776     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3777 
3778   // Track that we legalized this
3779   MI.setDesc(B.getTII().get(NewOpcode));
3780 
3781   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
3782   // dmask to be at least 1 otherwise the instruction will fail
3783   if (IsTFE && DMask == 0) {
3784     DMask = 0x1;
3785     DMaskLanes = 1;
3786     MI.getOperand(DMaskIdx).setImm(DMask);
3787   }
3788 
3789   if (BaseOpcode->Atomic) {
3790     Register VData0 = MI.getOperand(2).getReg();
3791     LLT Ty = MRI->getType(VData0);
3792 
3793     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3794     if (Ty.isVector())
3795       return false;
3796 
3797     if (BaseOpcode->AtomicX2) {
3798       Register VData1 = MI.getOperand(3).getReg();
3799       // The two values are packed in one register.
3800       LLT PackedTy = LLT::vector(2, Ty);
3801       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3802       MI.getOperand(2).setReg(Concat.getReg(0));
3803       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3804     }
3805   }
3806 
3807   int CorrectedNumVAddrs = NumVAddrs;
3808 
3809   // Optimize _L to _LZ when _L is zero
3810   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3811         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3812     const ConstantFP *ConstantLod;
3813     const int LodIdx = AddrIdx + NumVAddrs - 1;
3814 
3815     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3816       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3817         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3818         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3819           LZMappingInfo->LZ, ImageDimIntr->Dim);
3820 
3821         // The starting indexes should remain in the same place.
3822         --NumVAddrs;
3823         --CorrectedNumVAddrs;
3824 
3825         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3826           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3827         MI.RemoveOperand(LodIdx);
3828       }
3829     }
3830   }
3831 
3832   // Optimize _mip away, when 'lod' is zero
3833   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3834     int64_t ConstantLod;
3835     const int LodIdx = AddrIdx + NumVAddrs - 1;
3836 
3837     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3838       if (ConstantLod == 0) {
3839         // TODO: Change intrinsic opcode and remove operand instead or replacing
3840         // it with 0, as the _L to _LZ handling is done above.
3841         MI.getOperand(LodIdx).ChangeToImmediate(0);
3842         --CorrectedNumVAddrs;
3843       }
3844     }
3845   }
3846 
3847   // Rewrite the addressing register layout before doing anything else.
3848   if (IsA16 || IsG16) {
3849     if (IsA16) {
3850       // Target must support the feature and gradients need to be 16 bit too
3851       if (!ST.hasA16() || !IsG16)
3852         return false;
3853     } else if (!ST.hasG16())
3854       return false;
3855 
3856     if (NumVAddrs > 1) {
3857       SmallVector<Register, 4> PackedRegs;
3858       // Don't compress addresses for G16
3859       const int PackEndIdx =
3860           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3861       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3862                                   PackEndIdx, NumGradients);
3863 
3864       if (!IsA16) {
3865         // Add uncompressed address
3866         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3867           int AddrReg = MI.getOperand(I).getReg();
3868           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3869           PackedRegs.push_back(AddrReg);
3870         }
3871       }
3872 
3873       // See also below in the non-a16 branch
3874       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3875 
3876       if (!UseNSA && PackedRegs.size() > 1) {
3877         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3878         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3879         PackedRegs[0] = Concat.getReg(0);
3880         PackedRegs.resize(1);
3881       }
3882 
3883       const int NumPacked = PackedRegs.size();
3884       for (int I = 0; I != NumVAddrs; ++I) {
3885         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3886         if (!SrcOp.isReg()) {
3887           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3888           continue;
3889         }
3890 
3891         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3892 
3893         if (I < NumPacked)
3894           SrcOp.setReg(PackedRegs[I]);
3895         else
3896           SrcOp.setReg(AMDGPU::NoRegister);
3897       }
3898     }
3899   } else {
3900     // If the register allocator cannot place the address registers contiguously
3901     // without introducing moves, then using the non-sequential address encoding
3902     // is always preferable, since it saves VALU instructions and is usually a
3903     // wash in terms of code size or even better.
3904     //
3905     // However, we currently have no way of hinting to the register allocator
3906     // that MIMG addresses should be placed contiguously when it is possible to
3907     // do so, so force non-NSA for the common 2-address case as a heuristic.
3908     //
3909     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3910     // allocation when possible.
3911     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3912 
3913     if (!UseNSA && NumVAddrs > 1)
3914       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3915   }
3916 
3917   int Flags = 0;
3918   if (IsA16)
3919     Flags |= 1;
3920   if (IsG16)
3921     Flags |= 2;
3922   MI.addOperand(MachineOperand::CreateImm(Flags));
3923 
3924   if (BaseOpcode->Store) { // No TFE for stores?
3925     // TODO: Handle dmask trim
3926     Register VData = MI.getOperand(1).getReg();
3927     LLT Ty = MRI->getType(VData);
3928     if (!Ty.isVector() || Ty.getElementType() != S16)
3929       return true;
3930 
3931     Register RepackedReg = handleD16VData(B, *MRI, VData);
3932     if (RepackedReg != VData) {
3933       MI.getOperand(1).setReg(RepackedReg);
3934     }
3935 
3936     return true;
3937   }
3938 
3939   Register DstReg = MI.getOperand(0).getReg();
3940   LLT Ty = MRI->getType(DstReg);
3941   const LLT EltTy = Ty.getScalarType();
3942   const bool IsD16 = Ty.getScalarType() == S16;
3943   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3944 
3945   // Confirm that the return type is large enough for the dmask specified
3946   if (NumElts < DMaskLanes)
3947     return false;
3948 
3949   if (NumElts > 4 || DMaskLanes > 4)
3950     return false;
3951 
3952   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3953   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3954 
3955   // The raw dword aligned data component of the load. The only legal cases
3956   // where this matters should be when using the packed D16 format, for
3957   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3958   LLT RoundedTy;
3959 
3960   // S32 vector to to cover all data, plus TFE result element.
3961   LLT TFETy;
3962 
3963   // Register type to use for each loaded component. Will be S32 or V2S16.
3964   LLT RegTy;
3965 
3966   if (IsD16 && ST.hasUnpackedD16VMem()) {
3967     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3968     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3969     RegTy = S32;
3970   } else {
3971     unsigned EltSize = EltTy.getSizeInBits();
3972     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3973     unsigned RoundedSize = 32 * RoundedElts;
3974     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3975     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3976     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3977   }
3978 
3979   // The return type does not need adjustment.
3980   // TODO: Should we change s16 case to s32 or <2 x s16>?
3981   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3982     return true;
3983 
3984   Register Dst1Reg;
3985 
3986   // Insert after the instruction.
3987   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3988 
3989   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3990   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3991   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3992   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3993 
3994   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3995 
3996   MI.getOperand(0).setReg(NewResultReg);
3997 
3998   // In the IR, TFE is supposed to be used with a 2 element struct return
3999   // type. The intruction really returns these two values in one contiguous
4000   // register, with one additional dword beyond the loaded data. Rewrite the
4001   // return type to use a single register result.
4002 
4003   if (IsTFE) {
4004     Dst1Reg = MI.getOperand(1).getReg();
4005     if (MRI->getType(Dst1Reg) != S32)
4006       return false;
4007 
4008     // TODO: Make sure the TFE operand bit is set.
4009     MI.RemoveOperand(1);
4010 
4011     // Handle the easy case that requires no repack instructions.
4012     if (Ty == S32) {
4013       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4014       return true;
4015     }
4016   }
4017 
4018   // Now figure out how to copy the new result register back into the old
4019   // result.
4020   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4021 
4022   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4023 
4024   if (ResultNumRegs == 1) {
4025     assert(!IsTFE);
4026     ResultRegs[0] = NewResultReg;
4027   } else {
4028     // We have to repack into a new vector of some kind.
4029     for (int I = 0; I != NumDataRegs; ++I)
4030       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4031     B.buildUnmerge(ResultRegs, NewResultReg);
4032 
4033     // Drop the final TFE element to get the data part. The TFE result is
4034     // directly written to the right place already.
4035     if (IsTFE)
4036       ResultRegs.resize(NumDataRegs);
4037   }
4038 
4039   // For an s16 scalar result, we form an s32 result with a truncate regardless
4040   // of packed vs. unpacked.
4041   if (IsD16 && !Ty.isVector()) {
4042     B.buildTrunc(DstReg, ResultRegs[0]);
4043     return true;
4044   }
4045 
4046   // Avoid a build/concat_vector of 1 entry.
4047   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4048     B.buildBitcast(DstReg, ResultRegs[0]);
4049     return true;
4050   }
4051 
4052   assert(Ty.isVector());
4053 
4054   if (IsD16) {
4055     // For packed D16 results with TFE enabled, all the data components are
4056     // S32. Cast back to the expected type.
4057     //
4058     // TODO: We don't really need to use load s32 elements. We would only need one
4059     // cast for the TFE result if a multiple of v2s16 was used.
4060     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4061       for (Register &Reg : ResultRegs)
4062         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4063     } else if (ST.hasUnpackedD16VMem()) {
4064       for (Register &Reg : ResultRegs)
4065         Reg = B.buildTrunc(S16, Reg).getReg(0);
4066     }
4067   }
4068 
4069   auto padWithUndef = [&](LLT Ty, int NumElts) {
4070     if (NumElts == 0)
4071       return;
4072     Register Undef = B.buildUndef(Ty).getReg(0);
4073     for (int I = 0; I != NumElts; ++I)
4074       ResultRegs.push_back(Undef);
4075   };
4076 
4077   // Pad out any elements eliminated due to the dmask.
4078   LLT ResTy = MRI->getType(ResultRegs[0]);
4079   if (!ResTy.isVector()) {
4080     padWithUndef(ResTy, NumElts - ResultRegs.size());
4081     B.buildBuildVector(DstReg, ResultRegs);
4082     return true;
4083   }
4084 
4085   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4086   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4087 
4088   // Deal with the one annoying legal case.
4089   const LLT V3S16 = LLT::vector(3, 16);
4090   if (Ty == V3S16) {
4091     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4092     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4093     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4094     return true;
4095   }
4096 
4097   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4098   B.buildConcatVectors(DstReg, ResultRegs);
4099   return true;
4100 }
4101 
4102 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4103   MachineInstr &MI, MachineIRBuilder &B,
4104   GISelChangeObserver &Observer) const {
4105   Register Dst = MI.getOperand(0).getReg();
4106   LLT Ty = B.getMRI()->getType(Dst);
4107   unsigned Size = Ty.getSizeInBits();
4108   MachineFunction &MF = B.getMF();
4109 
4110   Observer.changingInstr(MI);
4111 
4112   // FIXME: We don't really need this intermediate instruction. The intrinsic
4113   // should be fixed to have a memory operand. Since it's readnone, we're not
4114   // allowed to add one.
4115   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4116   MI.RemoveOperand(1); // Remove intrinsic ID
4117 
4118   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4119   // TODO: Should this use datalayout alignment?
4120   const unsigned MemSize = (Size + 7) / 8;
4121   const Align MemAlign(4);
4122   MachineMemOperand *MMO = MF.getMachineMemOperand(
4123       MachinePointerInfo(),
4124       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4125           MachineMemOperand::MOInvariant,
4126       MemSize, MemAlign);
4127   MI.addMemOperand(MF, MMO);
4128 
4129   // There are no 96-bit result scalar loads, but widening to 128-bit should
4130   // always be legal. We may need to restore this to a 96-bit result if it turns
4131   // out this needs to be converted to a vector load during RegBankSelect.
4132   if (!isPowerOf2_32(Size)) {
4133     LegalizerHelper Helper(MF, *this, Observer, B);
4134 
4135     if (Ty.isVector())
4136       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4137     else
4138       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4139   }
4140 
4141   Observer.changedInstr(MI);
4142   return true;
4143 }
4144 
4145 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4146                                                 MachineRegisterInfo &MRI,
4147                                                 MachineIRBuilder &B) const {
4148   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4149   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4150       !ST.isTrapHandlerEnabled()) {
4151     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4152   } else {
4153     // Pass queue pointer to trap handler as input, and insert trap instruction
4154     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4155     MachineRegisterInfo &MRI = *B.getMRI();
4156     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4157     Register LiveIn = getLiveInRegister(
4158         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4159         /*InsertLiveInCopy=*/false);
4160     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4161       return false;
4162     B.buildCopy(SGPR01, LiveIn);
4163     B.buildInstr(AMDGPU::S_TRAP)
4164         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4165         .addReg(SGPR01, RegState::Implicit);
4166   }
4167 
4168   MI.eraseFromParent();
4169   return true;
4170 }
4171 
4172 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4173     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4174   // Is non-HSA path or trap-handler disabled? then, report a warning
4175   // accordingly
4176   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4177       !ST.isTrapHandlerEnabled()) {
4178     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4179                                      "debugtrap handler not supported",
4180                                      MI.getDebugLoc(), DS_Warning);
4181     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4182     Ctx.diagnose(NoTrap);
4183   } else {
4184     // Insert debug-trap instruction
4185     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4186   }
4187 
4188   MI.eraseFromParent();
4189   return true;
4190 }
4191 
4192 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4193                                             MachineInstr &MI) const {
4194   MachineIRBuilder &B = Helper.MIRBuilder;
4195   MachineRegisterInfo &MRI = *B.getMRI();
4196 
4197   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4198   auto IntrID = MI.getIntrinsicID();
4199   switch (IntrID) {
4200   case Intrinsic::amdgcn_if:
4201   case Intrinsic::amdgcn_else: {
4202     MachineInstr *Br = nullptr;
4203     MachineBasicBlock *UncondBrTarget = nullptr;
4204     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4205       const SIRegisterInfo *TRI
4206         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4207 
4208       Register Def = MI.getOperand(1).getReg();
4209       Register Use = MI.getOperand(3).getReg();
4210 
4211       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4212       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4213       if (IntrID == Intrinsic::amdgcn_if) {
4214         B.buildInstr(AMDGPU::SI_IF)
4215           .addDef(Def)
4216           .addUse(Use)
4217           .addMBB(UncondBrTarget);
4218       } else {
4219         B.buildInstr(AMDGPU::SI_ELSE)
4220           .addDef(Def)
4221           .addUse(Use)
4222           .addMBB(UncondBrTarget)
4223           .addImm(0);
4224       }
4225 
4226       if (Br) {
4227         Br->getOperand(0).setMBB(CondBrTarget);
4228       } else {
4229         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4230         // since we're swapping branch targets it needs to be reinserted.
4231         // FIXME: IRTranslator should probably not do this
4232         B.buildBr(*CondBrTarget);
4233       }
4234 
4235       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4236       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4237       MI.eraseFromParent();
4238       BrCond->eraseFromParent();
4239       return true;
4240     }
4241 
4242     return false;
4243   }
4244   case Intrinsic::amdgcn_loop: {
4245     MachineInstr *Br = nullptr;
4246     MachineBasicBlock *UncondBrTarget = nullptr;
4247     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4248       const SIRegisterInfo *TRI
4249         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4250 
4251       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4252       Register Reg = MI.getOperand(2).getReg();
4253 
4254       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4255       B.buildInstr(AMDGPU::SI_LOOP)
4256         .addUse(Reg)
4257         .addMBB(UncondBrTarget);
4258 
4259       if (Br)
4260         Br->getOperand(0).setMBB(CondBrTarget);
4261       else
4262         B.buildBr(*CondBrTarget);
4263 
4264       MI.eraseFromParent();
4265       BrCond->eraseFromParent();
4266       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4267       return true;
4268     }
4269 
4270     return false;
4271   }
4272   case Intrinsic::amdgcn_kernarg_segment_ptr:
4273     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4274       // This only makes sense to call in a kernel, so just lower to null.
4275       B.buildConstant(MI.getOperand(0).getReg(), 0);
4276       MI.eraseFromParent();
4277       return true;
4278     }
4279 
4280     return legalizePreloadedArgIntrin(
4281       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4282   case Intrinsic::amdgcn_implicitarg_ptr:
4283     return legalizeImplicitArgPtr(MI, MRI, B);
4284   case Intrinsic::amdgcn_workitem_id_x:
4285     return legalizePreloadedArgIntrin(MI, MRI, B,
4286                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4287   case Intrinsic::amdgcn_workitem_id_y:
4288     return legalizePreloadedArgIntrin(MI, MRI, B,
4289                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4290   case Intrinsic::amdgcn_workitem_id_z:
4291     return legalizePreloadedArgIntrin(MI, MRI, B,
4292                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4293   case Intrinsic::amdgcn_workgroup_id_x:
4294     return legalizePreloadedArgIntrin(MI, MRI, B,
4295                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4296   case Intrinsic::amdgcn_workgroup_id_y:
4297     return legalizePreloadedArgIntrin(MI, MRI, B,
4298                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4299   case Intrinsic::amdgcn_workgroup_id_z:
4300     return legalizePreloadedArgIntrin(MI, MRI, B,
4301                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4302   case Intrinsic::amdgcn_dispatch_ptr:
4303     return legalizePreloadedArgIntrin(MI, MRI, B,
4304                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4305   case Intrinsic::amdgcn_queue_ptr:
4306     return legalizePreloadedArgIntrin(MI, MRI, B,
4307                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4308   case Intrinsic::amdgcn_implicit_buffer_ptr:
4309     return legalizePreloadedArgIntrin(
4310       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4311   case Intrinsic::amdgcn_dispatch_id:
4312     return legalizePreloadedArgIntrin(MI, MRI, B,
4313                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4314   case Intrinsic::amdgcn_fdiv_fast:
4315     return legalizeFDIVFastIntrin(MI, MRI, B);
4316   case Intrinsic::amdgcn_is_shared:
4317     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4318   case Intrinsic::amdgcn_is_private:
4319     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4320   case Intrinsic::amdgcn_wavefrontsize: {
4321     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4322     MI.eraseFromParent();
4323     return true;
4324   }
4325   case Intrinsic::amdgcn_s_buffer_load:
4326     return legalizeSBufferLoad(MI, B, Helper.Observer);
4327   case Intrinsic::amdgcn_raw_buffer_store:
4328   case Intrinsic::amdgcn_struct_buffer_store:
4329     return legalizeBufferStore(MI, MRI, B, false, false);
4330   case Intrinsic::amdgcn_raw_buffer_store_format:
4331   case Intrinsic::amdgcn_struct_buffer_store_format:
4332     return legalizeBufferStore(MI, MRI, B, false, true);
4333   case Intrinsic::amdgcn_raw_tbuffer_store:
4334   case Intrinsic::amdgcn_struct_tbuffer_store:
4335     return legalizeBufferStore(MI, MRI, B, true, true);
4336   case Intrinsic::amdgcn_raw_buffer_load:
4337   case Intrinsic::amdgcn_struct_buffer_load:
4338     return legalizeBufferLoad(MI, MRI, B, false, false);
4339   case Intrinsic::amdgcn_raw_buffer_load_format:
4340   case Intrinsic::amdgcn_struct_buffer_load_format:
4341     return legalizeBufferLoad(MI, MRI, B, true, false);
4342   case Intrinsic::amdgcn_raw_tbuffer_load:
4343   case Intrinsic::amdgcn_struct_tbuffer_load:
4344     return legalizeBufferLoad(MI, MRI, B, true, true);
4345   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4346   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4347   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4348   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4349   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4350   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4351   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4352   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4353   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4354   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4355   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4356   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4357   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4358   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4359   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4360   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4361   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4362   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4363   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4364   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4365   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4366   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4367   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4368   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4369   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4370   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4371     return legalizeBufferAtomic(MI, B, IntrID);
4372   case Intrinsic::amdgcn_atomic_inc:
4373     return legalizeAtomicIncDec(MI, B, true);
4374   case Intrinsic::amdgcn_atomic_dec:
4375     return legalizeAtomicIncDec(MI, B, false);
4376   case Intrinsic::trap:
4377     return legalizeTrapIntrinsic(MI, MRI, B);
4378   case Intrinsic::debugtrap:
4379     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4380   default: {
4381     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4382             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4383       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4384     return true;
4385   }
4386   }
4387 
4388   return true;
4389 }
4390