1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
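// For example, <3 x s16> becomes <4 x s16>.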
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the number of bits up to the next power of two.
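// For example, s48 becomes s64.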
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
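// True for vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.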
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
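// True for vectors of 16-bit elements with more than two elements,
// e.g. <4 x s16>.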
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
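// Mutation that pads the vector type with one extra element,
// e.g. <3 x s16> -> <4 x s16>.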
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
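// Mutation that breaks a wide vector into pieces of at most 64 bits each,
// e.g. <4 x s32> -> <2 x s32>.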
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements to reach the next multiple of 32
// bits.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
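// Mutation that coerces an illegal type to a register-like type: 32 bits or
// less becomes a single scalar, anything larger becomes a scalar or vector of
// 32-bit elements, e.g. <6 x s16> -> <3 x s32>.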
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
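// True for vectors whose element type is directly usable: s16 or any element
// type of at least 32 bits.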
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
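// True for scalar truncating stores of values wider than 32 bits,
// e.g. storing only the low 16 bits of an s64 value.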
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp");
426     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
427       .legalFor({S32, S16, V2S16})
428       .clampScalar(0, S16, S32)
429       .clampMaxNumElements(0, S16, 2)
430       .scalarize(0)
431       .widenScalarToNextPow2(0, 32);
432 
433     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
434       .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul
435       .minScalar(0, S16)
436       .clampMaxNumElements(0, S16, 2)
437       .scalarize(0)
438       .widenScalarToNextPow2(0, 32)
439       .lower();
440   } else if (ST.has16BitInsts()) {
441     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
442       .legalFor({S32, S16})
443       .clampScalar(0, S16, S32)
444       .scalarize(0)
445       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
446 
447     assert(ST.hasIntClamp() && "all targets with 16-bit should support clamp");
448 
449     // Technically the saturating operations require clamp bit support, but this
450     // was introduced at the same time as 16-bit operations.
451     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
452       .lowerFor({S32, S16}) // FIXME: legal with clamp modifier
453       .minScalar(0, S16)
454       .scalarize(0)
455       .widenScalarToNextPow2(0, 16)
456       .lower();
457 
458     // We're just lowering this, but it helps get a better result to try to
459     // coerce to the desired type first.
460     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
461       .minScalar(0, S16)
462       .scalarize(0)
463       .lower();
464   } else {
465     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
466       .legalFor({S32})
467       .clampScalar(0, S32, S32)
468       .scalarize(0);
469 
470     if (ST.hasIntClamp()) {
471       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
472         .lowerFor({S32}) // FIXME: legal with clamp modifier.
473         .scalarize(0)
474         .minScalarOrElt(0, S32)
475         .lower();
476     } else {
477       // Clamp bit support was added in VI, along with 16-bit operations.
478       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
479         .minScalar(0, S32)
480         .scalarize(0)
481         .lower();
482     }
483 
484     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
485       .minScalar(0, S32)
486       .scalarize(0)
487       .lower();
488   }
489 
490   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
491     .customFor({S32, S64})
492     .clampScalar(0, S32, S64)
493     .widenScalarToNextPow2(0, 32)
494     .scalarize(0);
495 
496   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
497     .legalFor({S32})
498     .clampScalar(0, S32, S32)
499     .scalarize(0);
500 
501   // Report legal for any types we can handle anywhere. For the cases only legal
502   // on the SALU, RegBankSelect will be able to re-legalize.
503   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
504     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
505     .clampScalar(0, S32, S64)
506     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
507     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
508     .widenScalarToNextPow2(0)
509     .scalarize(0);
510 
511   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
512                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
513     .legalFor({{S32, S1}, {S32, S32}})
514     .minScalar(0, S32)
515     // TODO: .scalarize(0)
516     .lower();
517 
518   getActionDefinitionsBuilder(G_BITCAST)
519     // Don't worry about the size constraint.
520     .legalIf(all(isRegisterType(0), isRegisterType(1)))
521     .lower();
522 
523 
524   getActionDefinitionsBuilder(G_CONSTANT)
525     .legalFor({S1, S32, S64, S16, GlobalPtr,
526                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
527     .clampScalar(0, S32, S64)
528     .widenScalarToNextPow2(0)
529     .legalIf(isPointer(0));
530 
531   getActionDefinitionsBuilder(G_FCONSTANT)
532     .legalFor({S32, S64, S16})
533     .clampScalar(0, S16, S64);
534 
535   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
536       .legalIf(isRegisterType(0))
537       // s1 and s16 are special cases because they have legal operations on
538       // them, but don't really occupy registers in the normal way.
539       .legalFor({S1, S16})
540       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541       .clampScalarOrElt(0, S32, MaxScalar)
542       .widenScalarToNextPow2(0, 32)
543       .clampMaxNumElements(0, S32, 16);
544 
545   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
546 
547   // If the amount is divergent, we have to do a wave reduction to get the
548   // maximum value, so this is expanded during RegBankSelect.
549   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
550     .legalFor({{PrivatePtr, S32}});
551 
552   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
553     .unsupportedFor({PrivatePtr})
554     .custom();
555   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
556 
557   auto &FPOpActions = getActionDefinitionsBuilder(
558     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
559     .legalFor({S32, S64});
560   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
561     .customFor({S32, S64});
562   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
563     .customFor({S32, S64});
564 
565   if (ST.has16BitInsts()) {
566     if (ST.hasVOP3PInsts())
567       FPOpActions.legalFor({S16, V2S16});
568     else
569       FPOpActions.legalFor({S16});
570 
571     TrigActions.customFor({S16});
572     FDIVActions.customFor({S16});
573   }
574 
575   auto &MinNumMaxNum = getActionDefinitionsBuilder({
576       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
577 
578   if (ST.hasVOP3PInsts()) {
579     MinNumMaxNum.customFor(FPTypesPK16)
580       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
581       .clampMaxNumElements(0, S16, 2)
582       .clampScalar(0, S16, S64)
583       .scalarize(0);
584   } else if (ST.has16BitInsts()) {
585     MinNumMaxNum.customFor(FPTypes16)
586       .clampScalar(0, S16, S64)
587       .scalarize(0);
588   } else {
589     MinNumMaxNum.customFor(FPTypesBase)
590       .clampScalar(0, S32, S64)
591       .scalarize(0);
592   }
593 
594   if (ST.hasVOP3PInsts())
595     FPOpActions.clampMaxNumElements(0, S16, 2);
596 
597   FPOpActions
598     .scalarize(0)
599     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
600 
601   TrigActions
602     .scalarize(0)
603     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
604 
605   FDIVActions
606     .scalarize(0)
607     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
608 
609   getActionDefinitionsBuilder({G_FNEG, G_FABS})
610     .legalFor(FPTypesPK16)
611     .clampMaxNumElements(0, S16, 2)
612     .scalarize(0)
613     .clampScalar(0, S16, S64);
614 
615   if (ST.has16BitInsts()) {
616     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
617       .legalFor({S32, S64, S16})
618       .scalarize(0)
619       .clampScalar(0, S16, S64);
620   } else {
621     getActionDefinitionsBuilder(G_FSQRT)
622       .legalFor({S32, S64})
623       .scalarize(0)
624       .clampScalar(0, S32, S64);
625 
626     if (ST.hasFractBug()) {
627       getActionDefinitionsBuilder(G_FFLOOR)
628         .customFor({S64})
629         .legalFor({S32, S64})
630         .scalarize(0)
631         .clampScalar(0, S32, S64);
632     } else {
633       getActionDefinitionsBuilder(G_FFLOOR)
634         .legalFor({S32, S64})
635         .scalarize(0)
636         .clampScalar(0, S32, S64);
637     }
638   }
639 
640   getActionDefinitionsBuilder(G_FPTRUNC)
641     .legalFor({{S32, S64}, {S16, S32}})
642     .scalarize(0)
643     .lower();
644 
645   getActionDefinitionsBuilder(G_FPEXT)
646     .legalFor({{S64, S32}, {S32, S16}})
647     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
648     .scalarize(0);
649 
650   getActionDefinitionsBuilder(G_FSUB)
651       // Use actual fsub instruction
652       .legalFor({S32})
653       // Must use fadd + fneg
654       .lowerFor({S64, S16, V2S16})
655       .scalarize(0)
656       .clampScalar(0, S32, S64);
657 
658   // Whether this is legal depends on the floating point mode for the function.
659   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
660   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
661     FMad.customFor({S32, S16});
662   else if (ST.hasMadMacF32Insts())
663     FMad.customFor({S32});
664   else if (ST.hasMadF16())
665     FMad.customFor({S16});
666   FMad.scalarize(0)
667       .lower();
668 
669   // TODO: Do we need to clamp maximum bitwidth?
670   getActionDefinitionsBuilder(G_TRUNC)
671     .legalIf(isScalar(0))
672     .legalFor({{V2S16, V2S32}})
673     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
677     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
678     .alwaysLegal();
679 
680   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
681     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
682                {S32, S1}, {S64, S1}, {S16, S1}})
683     .scalarize(0)
684     .clampScalar(0, S32, S64)
685     .widenScalarToNextPow2(1, 32);
686 
687   // TODO: Split s1->s64 during regbankselect for VALU.
688   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
689     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
690     .lowerFor({{S32, S64}})
691     .lowerIf(typeIs(1, S1))
692     .customFor({{S64, S64}});
693   if (ST.has16BitInsts())
694     IToFP.legalFor({{S16, S16}});
695   IToFP.clampScalar(1, S32, S64)
696        .minScalar(0, S32)
697        .scalarize(0)
698        .widenScalarToNextPow2(1);
699 
700   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
701     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
702     .customFor({{S64, S64}})
703     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
704   if (ST.has16BitInsts())
705     FPToI.legalFor({{S16, S16}});
706   else
707     FPToI.minScalar(1, S32);
708 
709   FPToI.minScalar(0, S32)
710        .scalarize(0)
711        .lower();
712 
713   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
714     .scalarize(0)
715     .lower();
716 
717   if (ST.has16BitInsts()) {
718     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
719       .legalFor({S16, S32, S64})
720       .clampScalar(0, S16, S64)
721       .scalarize(0);
722   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
723     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
724       .legalFor({S32, S64})
725       .clampScalar(0, S32, S64)
726       .scalarize(0);
727   } else {
728     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
729       .legalFor({S32})
730       .customFor({S64})
731       .clampScalar(0, S32, S64)
732       .scalarize(0);
733   }
734 
735   // FIXME: Clamp offset operand.
736   getActionDefinitionsBuilder(G_PTR_ADD)
737     .legalIf(isPointer(0))
738     .scalarize(0);
739 
740   getActionDefinitionsBuilder(G_PTRMASK)
741     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
742     .scalarSameSizeAs(1, 0)
743     .scalarize(0);
744 
745   auto &CmpBuilder =
746     getActionDefinitionsBuilder(G_ICMP)
747     // The compare output type differs based on the register bank of the output,
748     // so make both s1 and s32 legal.
749     //
750     // Scalar compares producing output in scc will be promoted to s32, as that
751     // is the allocatable register type that will be needed for the copy from
752     // scc. This will be promoted during RegBankSelect, and we assume something
753     // before that won't try to use s32 result types.
754     //
755     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
756     // bank.
757     .legalForCartesianProduct(
758       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
759     .legalForCartesianProduct(
760       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
761   if (ST.has16BitInsts()) {
762     CmpBuilder.legalFor({{S1, S16}});
763   }
764 
765   CmpBuilder
766     .widenScalarToNextPow2(1)
767     .clampScalar(1, S32, S64)
768     .scalarize(0)
769     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
770 
771   getActionDefinitionsBuilder(G_FCMP)
772     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
773     .widenScalarToNextPow2(1)
774     .clampScalar(1, S32, S64)
775     .scalarize(0);
776 
777   // FIXME: fpow has a selection pattern that should move to custom lowering.
778   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
779   if (ST.has16BitInsts())
780     Exp2Ops.legalFor({S32, S16});
781   else
782     Exp2Ops.legalFor({S32});
783   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
784   Exp2Ops.scalarize(0);
785 
786   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
787   if (ST.has16BitInsts())
788     ExpOps.customFor({{S32}, {S16}});
789   else
790     ExpOps.customFor({S32});
791   ExpOps.clampScalar(0, MinScalarFPTy, S32)
792         .scalarize(0);
793 
794   getActionDefinitionsBuilder(G_FPOWI)
795     .clampScalar(0, MinScalarFPTy, S32)
796     .lower();
797 
798   // The 64-bit versions produce 32-bit results, but only on the SALU.
799   getActionDefinitionsBuilder(G_CTPOP)
800     .legalFor({{S32, S32}, {S32, S64}})
801     .clampScalar(0, S32, S32)
802     .clampScalar(1, S32, S64)
803     .scalarize(0)
804     .widenScalarToNextPow2(0, 32)
805     .widenScalarToNextPow2(1, 32);
806 
807   // The hardware instructions return a different result on 0 than the generic
808   // instructions expect. The hardware produces -1, but these produce the
809   // bitwidth.
810   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
811     .scalarize(0)
812     .clampScalar(0, S32, S32)
813     .clampScalar(1, S32, S64)
814     .widenScalarToNextPow2(0, 32)
815     .widenScalarToNextPow2(1, 32)
816     .lower();
817 
818   // The 64-bit versions produce 32-bit results, but only on the SALU.
819   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
820     .legalFor({{S32, S32}, {S32, S64}})
821     .clampScalar(0, S32, S32)
822     .clampScalar(1, S32, S64)
823     .scalarize(0)
824     .widenScalarToNextPow2(0, 32)
825     .widenScalarToNextPow2(1, 32);
826 
827   getActionDefinitionsBuilder(G_BITREVERSE)
828     .legalFor({S32})
829     .clampScalar(0, S32, S32)
830     .scalarize(0);
831 
832   if (ST.has16BitInsts()) {
833     getActionDefinitionsBuilder(G_BSWAP)
834       .legalFor({S16, S32, V2S16})
835       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
838       .widenScalarToNextPow2(0)
839       .clampScalar(0, S16, S32)
840       .scalarize(0);
841 
842     if (ST.hasVOP3PInsts()) {
843       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
844         .legalFor({S32, S16, V2S16})
845         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
846         .clampMaxNumElements(0, S16, 2)
847         .minScalar(0, S16)
848         .widenScalarToNextPow2(0)
849         .scalarize(0)
850         .lower();
851     } else {
852       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
853         .legalFor({S32, S16})
854         .widenScalarToNextPow2(0)
855         .minScalar(0, S16)
856         .scalarize(0)
857         .lower();
858     }
859   } else {
860     // TODO: Should have same legality without v_perm_b32
861     getActionDefinitionsBuilder(G_BSWAP)
862       .legalFor({S32})
863       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
866       .widenScalarToNextPow2(0)
867       .maxScalar(0, S32)
868       .scalarize(0)
869       .lower();
870 
871     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
872       .legalFor({S32})
873       .minScalar(0, S32)
874       .widenScalarToNextPow2(0)
875       .scalarize(0)
876       .lower();
877   }
878 
879   getActionDefinitionsBuilder(G_INTTOPTR)
880     // List the common cases
881     .legalForCartesianProduct(AddrSpaces64, {S64})
882     .legalForCartesianProduct(AddrSpaces32, {S32})
883     .scalarize(0)
884     // Accept any address space as long as the size matches
885     .legalIf(sameSize(0, 1))
886     .widenScalarIf(smallerThan(1, 0),
887       [](const LegalityQuery &Query) {
888         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
889       })
890     .narrowScalarIf(largerThan(1, 0),
891       [](const LegalityQuery &Query) {
892         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
893       });
894 
895   getActionDefinitionsBuilder(G_PTRTOINT)
896     // List the common cases
897     .legalForCartesianProduct(AddrSpaces64, {S64})
898     .legalForCartesianProduct(AddrSpaces32, {S32})
899     .scalarize(0)
900     // Accept any address space as long as the size matches
901     .legalIf(sameSize(0, 1))
902     .widenScalarIf(smallerThan(0, 1),
903       [](const LegalityQuery &Query) {
904         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
905       })
906     .narrowScalarIf(
907       largerThan(0, 1),
908       [](const LegalityQuery &Query) {
909         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
910       });
911 
912   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
913     .scalarize(0)
914     .custom();
915 
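  // Whether a memory access has to be split: vector extloads, accesses wider
  // than the address space allows, awkward register counts, and under-aligned
  // accesses the target can't handle all need splitting.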
916   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
917                                     bool IsLoad) -> bool {
918     const LLT DstTy = Query.Types[0];
919 
920     // Split vector extloads.
921     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
922     unsigned Align = Query.MMODescrs[0].AlignInBits;
923 
924     if (MemSize < DstTy.getSizeInBits())
925       MemSize = std::max(MemSize, Align);
926 
927     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
928       return true;
929 
930     const LLT PtrTy = Query.Types[1];
931     unsigned AS = PtrTy.getAddressSpace();
932     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
933       return true;
934 
935     // Catch weird sized loads that don't evenly divide into the access sizes
936     // TODO: May be able to widen depending on alignment etc.
937     unsigned NumRegs = (MemSize + 31) / 32;
938     if (NumRegs == 3) {
939       if (!ST.hasDwordx3LoadStores())
940         return true;
941     } else {
942       // If the alignment allows, these should have been widened.
943       if (!isPowerOf2_32(NumRegs))
944         return true;
945     }
946 
947     if (Align < MemSize) {
948       const SITargetLowering *TLI = ST.getTargetLowering();
949       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
950     }
951 
952     return false;
953   };
954 
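  // Whether to widen an odd-sized load result to the next power of 2: only
  // when the rounded size still fits the address space and the access is
  // already aligned to it, e.g. a 96-bit load with 128-bit alignment on
  // subtargets without dwordx3 load/stores.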
955   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
956                                          unsigned Opc) -> bool {
957     unsigned Size = Query.Types[0].getSizeInBits();
958     if (isPowerOf2_32(Size))
959       return false;
960 
961     if (Size == 96 && ST.hasDwordx3LoadStores())
962       return false;
963 
964     unsigned AddrSpace = Query.Types[1].getAddressSpace();
965     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
966       return false;
967 
968     unsigned Align = Query.MMODescrs[0].AlignInBits;
969     unsigned RoundedSize = NextPowerOf2(Size);
970     return (Align >= RoundedSize);
971   };
972 
973   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
974   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
975   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
976 
977   // TODO: Refine based on subtargets which support unaligned access or 128-bit
978   // LDS
979   // TODO: Unsupported flat for SI.
980 
981   for (unsigned Op : {G_LOAD, G_STORE}) {
982     const bool IsStore = Op == G_STORE;
983 
984     auto &Actions = getActionDefinitionsBuilder(Op);
985     // Explicitly list some common cases.
986     // TODO: Does this help compile time at all?
987     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
988                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
989                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
990                                       {S64, GlobalPtr, 64, GlobalAlign32},
991                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
992                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
993                                       {S32, GlobalPtr, 8, GlobalAlign8},
994                                       {S32, GlobalPtr, 16, GlobalAlign16},
995 
996                                       {S32, LocalPtr, 32, 32},
997                                       {S64, LocalPtr, 64, 32},
998                                       {V2S32, LocalPtr, 64, 32},
999                                       {S32, LocalPtr, 8, 8},
1000                                       {S32, LocalPtr, 16, 16},
1001                                       {V2S16, LocalPtr, 32, 32},
1002 
1003                                       {S32, PrivatePtr, 32, 32},
1004                                       {S32, PrivatePtr, 8, 8},
1005                                       {S32, PrivatePtr, 16, 16},
1006                                       {V2S16, PrivatePtr, 32, 32},
1007 
1008                                       {S32, ConstantPtr, 32, GlobalAlign32},
1009                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1010                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1011                                       {S64, ConstantPtr, 64, GlobalAlign32},
1012                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1013     Actions.legalIf(
1014       [=](const LegalityQuery &Query) -> bool {
1015         return isLoadStoreLegal(ST, Query, Op);
1016       });
1017 
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64 bits.
1020     //
1021     // TODO: Should generalize bitcast action into coerce, which will also cover
1022     // inserting addrspacecasts.
1023     Actions.customIf(typeIs(1, Constant32Ptr));
1024 
1025     // Turn any illegal element vectors into something easier to deal
1026     // with. These will ultimately produce 32-bit scalar shifts to extract the
1027     // parts anyway.
1028     //
1029     // For odd 16-bit element vectors, prefer to split those into pieces with
1030     // 16-bit vector parts.
1031     Actions.bitcastIf(
1032       [=](const LegalityQuery &Query) -> bool {
1033         const LLT Ty = Query.Types[0];
1034         const unsigned Size = Ty.getSizeInBits();
1035 
1036         if (Size != Query.MMODescrs[0].SizeInBits)
1037           return Size <= 32 && Ty.isVector();
1038 
1039         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
1040           return true;
1041         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
1042                !isRegisterVectorElementType(Ty.getElementType());
1043       }, bitcastToRegisterType(0));
1044 
1045     Actions
1046         .customIf(typeIs(1, Constant32Ptr))
1047         // Widen suitably aligned loads by loading extra elements.
1048         .moreElementsIf([=](const LegalityQuery &Query) {
1049             const LLT Ty = Query.Types[0];
1050             return Op == G_LOAD && Ty.isVector() &&
1051                    shouldWidenLoadResult(Query, Op);
1052           }, moreElementsToNextPow2(0))
1053         .widenScalarIf([=](const LegalityQuery &Query) {
1054             const LLT Ty = Query.Types[0];
1055             return Op == G_LOAD && !Ty.isVector() &&
1056                    shouldWidenLoadResult(Query, Op);
1057           }, widenScalarOrEltToNextPow2(0))
1058         .narrowScalarIf(
1059             [=](const LegalityQuery &Query) -> bool {
1060               return !Query.Types[0].isVector() &&
1061                      needToSplitMemOp(Query, Op == G_LOAD);
1062             },
1063             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1064               const LLT DstTy = Query.Types[0];
1065               const LLT PtrTy = Query.Types[1];
1066 
1067               const unsigned DstSize = DstTy.getSizeInBits();
1068               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1069 
1070               // Split extloads.
1071               if (DstSize > MemSize)
1072                 return std::make_pair(0, LLT::scalar(MemSize));
1073 
1074               if (!isPowerOf2_32(DstSize)) {
1075                 // We're probably decomposing an odd sized store. Try to split
1076                 // to the widest type. TODO: Account for alignment. As-is it
1077                 // should be OK, since the new parts will be further legalized.
1078                 unsigned FloorSize = PowerOf2Floor(DstSize);
1079                 return std::make_pair(0, LLT::scalar(FloorSize));
1080               }
1081 
1082               if (DstSize > 32 && (DstSize % 32 != 0)) {
1083                 // FIXME: Need a way to specify non-extload of larger size if
1084                 // suitably aligned.
1085                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1086               }
1087 
1088               unsigned MaxSize = maxSizeForAddrSpace(ST,
1089                                                      PtrTy.getAddressSpace(),
1090                                                      Op == G_LOAD);
1091               if (MemSize > MaxSize)
1092                 return std::make_pair(0, LLT::scalar(MaxSize));
1093 
1094               unsigned Align = Query.MMODescrs[0].AlignInBits;
1095               return std::make_pair(0, LLT::scalar(Align));
1096             })
1097         .fewerElementsIf(
1098             [=](const LegalityQuery &Query) -> bool {
1099               return Query.Types[0].isVector() &&
1100                      needToSplitMemOp(Query, Op == G_LOAD);
1101             },
1102             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1103               const LLT DstTy = Query.Types[0];
1104               const LLT PtrTy = Query.Types[1];
1105 
1106               LLT EltTy = DstTy.getElementType();
1107               unsigned MaxSize = maxSizeForAddrSpace(ST,
1108                                                      PtrTy.getAddressSpace(),
1109                                                      Op == G_LOAD);
1110 
              // FIXME: Handle results widened to a power of 2 better. This
              // ends up scalarizing.
1113               // FIXME: 3 element stores scalarized on SI
1114 
1115               // Split if it's too large for the address space.
1116               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1117                 unsigned NumElts = DstTy.getNumElements();
1118                 unsigned EltSize = EltTy.getSizeInBits();
1119 
1120                 if (MaxSize % EltSize == 0) {
1121                   return std::make_pair(
1122                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1123                 }
1124 
1125                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1126 
1127                 // FIXME: Refine when odd breakdowns handled
1128                 // The scalars will need to be re-legalized.
1129                 if (NumPieces == 1 || NumPieces >= NumElts ||
1130                     NumElts % NumPieces != 0)
1131                   return std::make_pair(0, EltTy);
1132 
1133                 return std::make_pair(0,
1134                                       LLT::vector(NumElts / NumPieces, EltTy));
1135               }
1136 
1137               // FIXME: We could probably handle weird extending loads better.
1138               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1139               if (DstTy.getSizeInBits() > MemSize)
1140                 return std::make_pair(0, EltTy);
1141 
1142               unsigned EltSize = EltTy.getSizeInBits();
1143               unsigned DstSize = DstTy.getSizeInBits();
1144               if (!isPowerOf2_32(DstSize)) {
1145                 // We're probably decomposing an odd sized store. Try to split
1146                 // to the widest type. TODO: Account for alignment. As-is it
1147                 // should be OK, since the new parts will be further legalized.
1148                 unsigned FloorSize = PowerOf2Floor(DstSize);
1149                 return std::make_pair(
1150                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1151               }
1152 
1153               // Need to split because of alignment.
1154               unsigned Align = Query.MMODescrs[0].AlignInBits;
1155               if (EltSize > Align &&
1156                   (EltSize / Align < DstTy.getNumElements())) {
1157                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1158               }
1159 
1160               // May need relegalization for the scalars.
1161               return std::make_pair(0, EltTy);
1162             })
1163         .minScalar(0, S32);
1164 
1165     if (IsStore)
1166       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1167 
1168     // TODO: Need a bitcast lower option?
1169     Actions
1170         .widenScalarToNextPow2(0)
1171         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1172   }
1173 
1174   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1175                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1176                                                   {S32, GlobalPtr, 16, 2 * 8},
1177                                                   {S32, LocalPtr, 8, 8},
1178                                                   {S32, LocalPtr, 16, 16},
1179                                                   {S32, PrivatePtr, 8, 8},
1180                                                   {S32, PrivatePtr, 16, 16},
1181                                                   {S32, ConstantPtr, 8, 8},
1182                                                   {S32, ConstantPtr, 16, 2 * 8}});
1183   if (ST.hasFlatAddressSpace()) {
1184     ExtLoads.legalForTypesWithMemDesc(
1185         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1186   }
1187 
1188   ExtLoads.clampScalar(0, S32, S32)
1189           .widenScalarToNextPow2(0)
1190           .unsupportedIfMemSizeNotPow2()
1191           .lower();
1192 
1193   auto &Atomics = getActionDefinitionsBuilder(
1194     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1195      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1196      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1197      G_ATOMICRMW_UMIN})
1198     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1199                {S64, GlobalPtr}, {S64, LocalPtr}});
1200   if (ST.hasFlatAddressSpace()) {
1201     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1202   }
1203 
1204   if (ST.hasLDSFPAtomics()) {
1205     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1206       .legalFor({{S32, LocalPtr}});
1207   }
1208 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1211   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1212     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1213                 {S32, FlatPtr}, {S64, FlatPtr}})
1214     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1215                {S32, RegionPtr}, {S64, RegionPtr}});
1216   // TODO: Pointer types, any 32-bit or 64-bit vector
1217 
1218   // Condition should be s32 for scalar, s1 for vector.
1219   getActionDefinitionsBuilder(G_SELECT)
1220     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1221           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1222           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1223     .clampScalar(0, S16, S64)
1224     .scalarize(1)
1225     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1226     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1227     .clampMaxNumElements(0, S32, 2)
1228     .clampMaxNumElements(0, LocalPtr, 2)
1229     .clampMaxNumElements(0, PrivatePtr, 2)
1230     .scalarize(0)
1231     .widenScalarToNextPow2(0)
1232     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1233 
1234   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1235   // be more flexible with the shift amount type.
1236   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1237     .legalFor({{S32, S32}, {S64, S32}});
1238   if (ST.has16BitInsts()) {
1239     if (ST.hasVOP3PInsts()) {
1240       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1241             .clampMaxNumElements(0, S16, 2);
1242     } else
1243       Shifts.legalFor({{S16, S16}});
1244 
1245     // TODO: Support 16-bit shift amounts for all types
1246     Shifts.widenScalarIf(
1247       [=](const LegalityQuery &Query) {
1248         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1249         // 32-bit amount.
1250         const LLT ValTy = Query.Types[0];
1251         const LLT AmountTy = Query.Types[1];
1252         return ValTy.getSizeInBits() <= 16 &&
1253                AmountTy.getSizeInBits() < 16;
1254       }, changeTo(1, S16));
1255     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1256     Shifts.clampScalar(1, S32, S32);
1257     Shifts.clampScalar(0, S16, S64);
1258     Shifts.widenScalarToNextPow2(0, 16);
1259   } else {
1260     // Make sure we legalize the shift amount type first, as the general
1261     // expansion for the shifted type will produce much worse code if it hasn't
1262     // been truncated already.
1263     Shifts.clampScalar(1, S32, S32);
1264     Shifts.clampScalar(0, S32, S64);
1265     Shifts.widenScalarToNextPow2(0, 32);
1266   }
1267   Shifts.scalarize(0);
1268 
1269   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1270     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1271     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1272     unsigned IdxTypeIdx = 2;
1273 
1274     getActionDefinitionsBuilder(Op)
1275       .customIf([=](const LegalityQuery &Query) {
1276           const LLT EltTy = Query.Types[EltTypeIdx];
1277           const LLT VecTy = Query.Types[VecTypeIdx];
1278           const LLT IdxTy = Query.Types[IdxTypeIdx];
1279           return (EltTy.getSizeInBits() == 16 ||
1280                   EltTy.getSizeInBits() % 32 == 0) &&
1281                  VecTy.getSizeInBits() % 32 == 0 &&
1282                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1283                  IdxTy.getSizeInBits() == 32;
1284         })
1285       .clampScalar(EltTypeIdx, S32, S64)
1286       .clampScalar(VecTypeIdx, S32, S64)
1287       .clampScalar(IdxTypeIdx, S32, S32);
1288   }
1289 
1290   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1291     .unsupportedIf([=](const LegalityQuery &Query) {
1292         const LLT &EltTy = Query.Types[1].getElementType();
1293         return Query.Types[0] != EltTy;
1294       });
1295 
1296   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1297     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1298     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1299 
1300     // FIXME: Doesn't handle extract of illegal sizes.
1301     getActionDefinitionsBuilder(Op)
1302       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1303       // FIXME: Multiples of 16 should not be legal.
1304       .legalIf([=](const LegalityQuery &Query) {
1305           const LLT BigTy = Query.Types[BigTyIdx];
1306           const LLT LitTy = Query.Types[LitTyIdx];
1307           return (BigTy.getSizeInBits() % 32 == 0) &&
1308                  (LitTy.getSizeInBits() % 16 == 0);
1309         })
1310       .widenScalarIf(
1311         [=](const LegalityQuery &Query) {
1312           const LLT BigTy = Query.Types[BigTyIdx];
1313           return (BigTy.getScalarSizeInBits() < 16);
1314         },
1315         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1316       .widenScalarIf(
1317         [=](const LegalityQuery &Query) {
1318           const LLT LitTy = Query.Types[LitTyIdx];
1319           return (LitTy.getScalarSizeInBits() < 16);
1320         },
1321         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1322       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1323       .widenScalarToNextPow2(BigTyIdx, 32);
1324 
1325   }
1326 
1327   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1328     .legalForCartesianProduct(AllS32Vectors, {S32})
1329     .legalForCartesianProduct(AllS64Vectors, {S64})
1330     .clampNumElements(0, V16S32, V32S32)
1331     .clampNumElements(0, V2S64, V16S64)
1332     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1333 
1334   if (ST.hasScalarPackInsts()) {
1335     BuildVector
1336       // FIXME: Should probably widen s1 vectors straight to s32
1337       .minScalarOrElt(0, S16)
1338       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1339       .minScalar(1, S32);
1340 
1341     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1342       .legalFor({V2S16, S32})
1343       .lower();
1344     BuildVector.minScalarOrElt(0, S32);
1345   } else {
1346     BuildVector.customFor({V2S16, S16});
1347     BuildVector.minScalarOrElt(0, S32);
1348 
1349     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1350       .customFor({V2S16, S32})
1351       .lower();
1352   }
1353 
1354   BuildVector.legalIf(isRegisterType(0));
1355 
1356   // FIXME: Clamp maximum size
1357   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1358     .legalIf(isRegisterType(0));
1359 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1362   if (ST.hasVOP3PInsts()) {
1363     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1364       .customFor({V2S16, V2S16})
1365       .lower();
1366   } else
1367     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1368 
1369   // Merge/Unmerge
1370   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1371     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1372     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1373 
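    // Reject vector element types the merge/unmerge expansion can't handle:
    // narrower than 8 bits, wider than 512 bits, or not a power of 2 in size.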
1374     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1375       const LLT Ty = Query.Types[TypeIdx];
1376       if (Ty.isVector()) {
1377         const LLT &EltTy = Ty.getElementType();
1378         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1379           return true;
1380         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1381           return true;
1382       }
1383       return false;
1384     };
1385 
1386     auto &Builder = getActionDefinitionsBuilder(Op)
1387       .lowerFor({{S16, V2S16}})
1388       .lowerIf([=](const LegalityQuery &Query) {
1389           const LLT BigTy = Query.Types[BigTyIdx];
1390           return BigTy.getSizeInBits() == 32;
1391         })
1392       // Try to widen to s16 first for small types.
1393       // TODO: Only do this on targets with legal s16 shifts
1394       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1395       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1396       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1397       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1398                            elementTypeIs(1, S16)),
1399                        changeTo(1, V2S16))
1400       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1401       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1402       // valid.
1403       .clampScalar(LitTyIdx, S32, S512)
1404       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1405       // Break up vectors with weird elements into scalars
1406       .fewerElementsIf(
1407         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1408         scalarize(0))
1409       .fewerElementsIf(
1410         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1411         scalarize(1))
1412       .clampScalar(BigTyIdx, S32, MaxScalar);
1413 
1414     if (Op == G_MERGE_VALUES) {
1415       Builder.widenScalarIf(
1416         // TODO: Use 16-bit shifts if legal for 8-bit values?
1417         [=](const LegalityQuery &Query) {
1418           const LLT Ty = Query.Types[LitTyIdx];
1419           return Ty.getSizeInBits() < 32;
1420         },
1421         changeTo(LitTyIdx, S32));
1422     }
1423 
1424     Builder.widenScalarIf(
1425       [=](const LegalityQuery &Query) {
1426         const LLT Ty = Query.Types[BigTyIdx];
1427         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1428           Ty.getSizeInBits() % 16 != 0;
1429       },
1430       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1433         const LLT &Ty = Query.Types[BigTyIdx];
1434         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1435         if (NewSizeInBits >= 256) {
1436           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1437           if (RoundedTo < NewSizeInBits)
1438             NewSizeInBits = RoundedTo;
1439         }
1440         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1441       })
1442       .legalIf([=](const LegalityQuery &Query) {
1443           const LLT &BigTy = Query.Types[BigTyIdx];
1444           const LLT &LitTy = Query.Types[LitTyIdx];
1445 
1446           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1447             return false;
1448           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1449             return false;
1450 
1451           return BigTy.getSizeInBits() % 16 == 0 &&
1452                  LitTy.getSizeInBits() % 16 == 0 &&
1453                  BigTy.getSizeInBits() <= MaxRegisterSize;
1454         })
1455       // Any vectors left are the wrong size. Scalarize them.
1456       .scalarize(0)
1457       .scalarize(1);
1458   }
1459 
1460   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1461   // RegBankSelect.
1462   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1463     .legalFor({{S32}, {S64}});
1464 
1465   if (ST.hasVOP3PInsts()) {
1466     SextInReg.lowerFor({{V2S16}})
1467       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1468       // get more vector shift opportunities, since we'll get those when
1469       // expanded.
1470       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1471   } else if (ST.has16BitInsts()) {
1472     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1473   } else {
1474     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1476     SextInReg.lowerFor({{S32}, {S64}});
1477   }
1478 
1479   SextInReg
1480     .scalarize(0)
1481     .clampScalar(0, S32, S64)
1482     .lower();
1483 
1484   getActionDefinitionsBuilder(G_FSHR)
1485     .legalFor({{S32, S32}})
1486     .scalarize(0)
1487     .lower();
1488 
1489   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1490     .legalFor({S64});
1491 
1492   getActionDefinitionsBuilder({
1493       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1494       G_FCOPYSIGN,
1495 
1496       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1497       G_READ_REGISTER,
1498       G_WRITE_REGISTER,
1499 
1500       G_SADDO, G_SSUBO,
1501 
      // TODO: Implement
1503       G_FMINIMUM, G_FMAXIMUM,
1504       G_FSHL
1505     }).lower();
1506 
1507   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1508         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1509         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1510     .unsupported();
1511 
1512   computeTables();
1513   verify(*ST.getInstrInfo());
1514 }
1515 
1516 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1517                                          MachineInstr &MI) const {
1518   MachineIRBuilder &B = Helper.MIRBuilder;
1519   MachineRegisterInfo &MRI = *B.getMRI();
1520   GISelChangeObserver &Observer = Helper.Observer;
1521 
1522   switch (MI.getOpcode()) {
1523   case TargetOpcode::G_ADDRSPACE_CAST:
1524     return legalizeAddrSpaceCast(MI, MRI, B);
1525   case TargetOpcode::G_FRINT:
1526     return legalizeFrint(MI, MRI, B);
1527   case TargetOpcode::G_FCEIL:
1528     return legalizeFceil(MI, MRI, B);
1529   case TargetOpcode::G_INTRINSIC_TRUNC:
1530     return legalizeIntrinsicTrunc(MI, MRI, B);
1531   case TargetOpcode::G_SITOFP:
1532     return legalizeITOFP(MI, MRI, B, true);
1533   case TargetOpcode::G_UITOFP:
1534     return legalizeITOFP(MI, MRI, B, false);
1535   case TargetOpcode::G_FPTOSI:
1536     return legalizeFPTOI(MI, MRI, B, true);
1537   case TargetOpcode::G_FPTOUI:
1538     return legalizeFPTOI(MI, MRI, B, false);
1539   case TargetOpcode::G_FMINNUM:
1540   case TargetOpcode::G_FMAXNUM:
1541   case TargetOpcode::G_FMINNUM_IEEE:
1542   case TargetOpcode::G_FMAXNUM_IEEE:
1543     return legalizeMinNumMaxNum(Helper, MI);
1544   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1545     return legalizeExtractVectorElt(MI, MRI, B);
1546   case TargetOpcode::G_INSERT_VECTOR_ELT:
1547     return legalizeInsertVectorElt(MI, MRI, B);
1548   case TargetOpcode::G_SHUFFLE_VECTOR:
1549     return legalizeShuffleVector(MI, MRI, B);
1550   case TargetOpcode::G_FSIN:
1551   case TargetOpcode::G_FCOS:
1552     return legalizeSinCos(MI, MRI, B);
1553   case TargetOpcode::G_GLOBAL_VALUE:
1554     return legalizeGlobalValue(MI, MRI, B);
1555   case TargetOpcode::G_LOAD:
1556     return legalizeLoad(MI, MRI, B, Observer);
1557   case TargetOpcode::G_FMAD:
1558     return legalizeFMad(MI, MRI, B);
1559   case TargetOpcode::G_FDIV:
1560     return legalizeFDIV(MI, MRI, B);
1561   case TargetOpcode::G_UDIV:
1562   case TargetOpcode::G_UREM:
1563     return legalizeUDIV_UREM(MI, MRI, B);
1564   case TargetOpcode::G_SDIV:
1565   case TargetOpcode::G_SREM:
1566     return legalizeSDIV_SREM(MI, MRI, B);
1567   case TargetOpcode::G_ATOMIC_CMPXCHG:
1568     return legalizeAtomicCmpXChg(MI, MRI, B);
1569   case TargetOpcode::G_FLOG:
1570     return legalizeFlog(MI, B, numbers::ln2f);
1571   case TargetOpcode::G_FLOG10:
1572     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1573   case TargetOpcode::G_FEXP:
1574     return legalizeFExp(MI, B);
1575   case TargetOpcode::G_FPOW:
1576     return legalizeFPow(MI, B);
1577   case TargetOpcode::G_FFLOOR:
1578     return legalizeFFloor(MI, MRI, B);
1579   case TargetOpcode::G_BUILD_VECTOR:
1580     return legalizeBuildVector(MI, MRI, B);
1581   default:
1582     return false;
1583   }
1584 
1585   llvm_unreachable("expected switch to return");
1586 }
1587 
1588 Register AMDGPULegalizerInfo::getSegmentAperture(
1589   unsigned AS,
1590   MachineRegisterInfo &MRI,
1591   MachineIRBuilder &B) const {
1592   MachineFunction &MF = B.getMF();
1593   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1594   const LLT S32 = LLT::scalar(32);
1595 
1596   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1597 
1598   if (ST.hasApertureRegs()) {
1599     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1600     // getreg.
1601     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1602         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1603         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1604     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1605         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1606         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1607     unsigned Encoding =
1608         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1609         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1610         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
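
    // The aperture base is exposed as a 16-bit hwreg field that holds the
    // base shifted right by 16; shifting the getreg result left by
    // WidthM1 + 1 (i.e. 16) rebuilds the 32-bit aperture value, which is the
    // high half of the 64-bit flat address.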
1611 
1612     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1613 
1614     B.buildInstr(AMDGPU::S_GETREG_B32)
1615       .addDef(GetReg)
1616       .addImm(Encoding);
1617     MRI.setType(GetReg, S32);
1618 
1619     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1620     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1621   }
1622 
1623   Register QueuePtr = MRI.createGenericVirtualRegister(
1624     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1625 
1626   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1627   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1628     return Register();
1629 
1630   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1631   // private_segment_aperture_base_hi.
1632   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1633 
1634   // TODO: can we be smarter about machine pointer info?
1635   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1636   MachineMemOperand *MMO = MF.getMachineMemOperand(
1637       PtrInfo,
1638       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1639           MachineMemOperand::MOInvariant,
1640       4, commonAlignment(Align(64), StructOffset));
1641 
1642   Register LoadAddr;
1643 
1644   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1645   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1646 }
1647 
1648 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1649   MachineInstr &MI, MachineRegisterInfo &MRI,
1650   MachineIRBuilder &B) const {
1651   MachineFunction &MF = B.getMF();
1652 
1653   const LLT S32 = LLT::scalar(32);
1654   Register Dst = MI.getOperand(0).getReg();
1655   Register Src = MI.getOperand(1).getReg();
1656 
1657   LLT DstTy = MRI.getType(Dst);
1658   LLT SrcTy = MRI.getType(Src);
1659   unsigned DestAS = DstTy.getAddressSpace();
1660   unsigned SrcAS = SrcTy.getAddressSpace();
1661 
1662   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1663   // vector element.
1664   assert(!DstTy.isVector());
1665 
1666   const AMDGPUTargetMachine &TM
1667     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1668 
1669   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1670   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1671     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1672     return true;
1673   }
1674 
1675   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1676     // Truncate.
1677     B.buildExtract(Dst, Src, 0);
1678     MI.eraseFromParent();
1679     return true;
1680   }
1681 
1682   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1683     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1684     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1685 
    // FIXME: This is a bit ugly due to merging 2 pointers into another
    // pointer. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
1689     auto HighAddr = B.buildConstant(
1690       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1691     B.buildMerge(Dst, {Src, HighAddr});
1692     MI.eraseFromParent();
1693     return true;
1694   }
1695 
1696   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
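    // A flat pointer is converted to an LDS/private pointer by taking its low
    // 32 bits, with flat null mapped to the segment's null value:
    //   dst = (src != flat_null) ? lo32(src) : segment_null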
1697     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1698            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1699     unsigned NullVal = TM.getNullPointerValue(DestAS);
1700 
1701     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1702     auto FlatNull = B.buildConstant(SrcTy, 0);
1703 
1704     // Extract low 32-bits of the pointer.
1705     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1706 
1707     auto CmpRes =
1708         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1709     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1710 
1711     MI.eraseFromParent();
1712     return true;
1713   }
1714 
1715   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1716     return false;
1717 
1718   if (!ST.hasFlatAddressSpace())
1719     return false;
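
  // The reverse direction: a group/private pointer becomes a flat pointer by
  // placing the segment's aperture in the high 32 bits, with the segment's
  // null value mapped to flat null:
  //   dst = (src != segment_null) ? {src, aperture} : flat_null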
1720 
1721   auto SegmentNull =
1722       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1723   auto FlatNull =
1724       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1725 
1726   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1727   if (!ApertureReg.isValid())
1728     return false;
1729 
1730   auto CmpRes =
1731       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1732 
1733   // Coerce the type of the low half of the result so we can use merge_values.
1734   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1735 
1736   // TODO: Should we allow mismatched types but matching sizes in merges to
1737   // avoid the ptrtoint?
1738   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1739   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1740 
1741   MI.eraseFromParent();
1742   return true;
1743 }
1744 
1745 bool AMDGPULegalizerInfo::legalizeFrint(
1746   MachineInstr &MI, MachineRegisterInfo &MRI,
1747   MachineIRBuilder &B) const {
1748   Register Src = MI.getOperand(1).getReg();
1749   LLT Ty = MRI.getType(Src);
1750   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1751 
1752   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1753   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
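  // C1Val is 2^52: adding and then subtracting copysign(2^52, src) rounds
  // src to an integer under the current rounding mode, since doubles with
  // magnitude >= 2^52 have no fractional bits. C2Val (2^52 - 0.5) is the
  // largest double below 2^52; if |src| > C2Val, src is already integral and
  // is returned unchanged.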
1754 
1755   auto C1 = B.buildFConstant(Ty, C1Val);
1756   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1757 
1758   // TODO: Should this propagate fast-math-flags?
1759   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1760   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1761 
1762   auto C2 = B.buildFConstant(Ty, C2Val);
1763   auto Fabs = B.buildFAbs(Ty, Src);
1764 
1765   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1766   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1767   MI.eraseFromParent();
1768   return true;
1769 }
1770 
1771 bool AMDGPULegalizerInfo::legalizeFceil(
1772   MachineInstr &MI, MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774 
1775   const LLT S1 = LLT::scalar(1);
1776   const LLT S64 = LLT::scalar(64);
1777 
1778   Register Src = MI.getOperand(1).getReg();
1779   assert(MRI.getType(Src) == S64);
1780 
1781   // result = trunc(src)
1782   // if (src > 0.0 && src != result)
1783   //   result += 1.0
1784 
1785   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1786 
1787   const auto Zero = B.buildFConstant(S64, 0.0);
1788   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1792   auto Add = B.buildSelect(S64, And, One, Zero);
1793 
1794   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1797 }
1798 
1799 static MachineInstrBuilder extractF64Exponent(Register Hi,
1800                                               MachineIRBuilder &B) {
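  // The exponent of an IEEE-754 double occupies bits [62:52], i.e. bits
  // [30:20] of the high 32-bit word. Extract those 11 bits with ubfe and
  // subtract the exponent bias (1023) to get the unbiased exponent.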
1801   const unsigned FractBits = 52;
1802   const unsigned ExpBits = 11;
1803   LLT S32 = LLT::scalar(32);
1804 
1805   auto Const0 = B.buildConstant(S32, FractBits - 32);
1806   auto Const1 = B.buildConstant(S32, ExpBits);
1807 
1808   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1809     .addUse(Hi)
1810     .addUse(Const0.getReg(0))
1811     .addUse(Const1.getReg(0));
1812 
1813   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1814 }
1815 
1816 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1817   MachineInstr &MI, MachineRegisterInfo &MRI,
1818   MachineIRBuilder &B) const {
1819   const LLT S1 = LLT::scalar(1);
1820   const LLT S32 = LLT::scalar(32);
1821   const LLT S64 = LLT::scalar(64);
1822 
1823   Register Src = MI.getOperand(1).getReg();
1824   assert(MRI.getType(Src) == S64);
1825 
1826   // TODO: Should this use extract since the low half is unused?
1827   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1828   Register Hi = Unmerge.getReg(1);
1829 
1830   // Extract the upper half, since this is where we will find the sign and
1831   // exponent.
1832   auto Exp = extractF64Exponent(Hi, B);
1833 
1834   const unsigned FractBits = 52;
1835 
1836   // Extract the sign bit.
1837   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1838   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1839 
1840   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1841 
1842   const auto Zero32 = B.buildConstant(S32, 0);
1843 
1844   // Extend back to 64-bits.
1845   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1846 
1847   auto Shr = B.buildAShr(S64, FractMask, Exp);
1848   auto Not = B.buildNot(S64, Shr);
1849   auto Tmp0 = B.buildAnd(S64, Src, Not);
1850   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1851 
1852   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1853   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1854 
1855   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1856   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1857   MI.eraseFromParent();
1858   return true;
1859 }
1860 
1861 bool AMDGPULegalizerInfo::legalizeITOFP(
1862   MachineInstr &MI, MachineRegisterInfo &MRI,
1863   MachineIRBuilder &B, bool Signed) const {
1864 
1865   Register Dst = MI.getOperand(0).getReg();
1866   Register Src = MI.getOperand(1).getReg();
1867 
1868   const LLT S64 = LLT::scalar(64);
1869   const LLT S32 = LLT::scalar(32);
1870 
1871   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
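
  // Split the 64-bit integer into 32-bit halves, convert each half, and
  // recombine: result = ldexp((fp)hi, 32) + (fp)(uint32_t)lo. Only the high
  // half carries the sign in the signed case.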
1872 
1873   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1874 
1875   auto CvtHi = Signed ?
1876     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1877     B.buildUITOFP(S64, Unmerge.getReg(1));
1878 
1879   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1880 
1881   auto ThirtyTwo = B.buildConstant(S32, 32);
1882   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1883     .addUse(CvtHi.getReg(0))
1884     .addUse(ThirtyTwo.getReg(0));
1885 
1886   // TODO: Should this propagate fast-math-flags?
1887   B.buildFAdd(Dst, LdExp, CvtLo);
1888   MI.eraseFromParent();
1889   return true;
1890 }
1891 
1892 // TODO: Copied from DAG implementation. Verify logic and document how this
1893 // actually works.
1894 bool AMDGPULegalizerInfo::legalizeFPTOI(
1895   MachineInstr &MI, MachineRegisterInfo &MRI,
1896   MachineIRBuilder &B, bool Signed) const {
1897 
1898   Register Dst = MI.getOperand(0).getReg();
1899   Register Src = MI.getOperand(1).getReg();
1900 
1901   const LLT S64 = LLT::scalar(64);
1902   const LLT S32 = LLT::scalar(32);
1903 
1904   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1905 
1906   unsigned Flags = MI.getFlags();
1907 
1908   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1909   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1910   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
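
  // K0 is 2^-32 and K1 is -(2^32). The high half is floor(trunc(x) * 2^-32)
  // converted to an integer; the low half is the remaining low 32 bits,
  // trunc(x) - hi * 2^32, computed via the FMA and converted with fptoui.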
1911 
1912   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1913   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1914   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1915 
1916   auto Hi = Signed ?
1917     B.buildFPTOSI(S32, FloorMul) :
1918     B.buildFPTOUI(S32, FloorMul);
1919   auto Lo = B.buildFPTOUI(S32, Fma);
1920 
1921   B.buildMerge(Dst, { Lo, Hi });
1922   MI.eraseFromParent();
1923 
1924   return true;
1925 }
1926 
1927 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1928                                                MachineInstr &MI) const {
1929   MachineFunction &MF = Helper.MIRBuilder.getMF();
1930   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1931 
1932   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1933                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1934 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1937   if (!MFI->getMode().IEEE)
1938     return !IsIEEEOp;
1939 
1940   if (IsIEEEOp)
1941     return true;
1942 
1943   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949   // TODO: Should move some of this into LegalizerHelper.
1950 
1951   // TODO: Promote dynamic indexing of s16 to s32
1952 
1953   // FIXME: Artifact combiner probably should have replaced the truncated
1954   // constant before this, so we shouldn't need
1955   // getConstantVRegValWithLookThrough.
1956   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1957     MI.getOperand(2).getReg(), MRI);
1958   if (!IdxVal) // Dynamic case will be selected to register indexing.
1959     return true;
1960 
1961   Register Dst = MI.getOperand(0).getReg();
1962   Register Vec = MI.getOperand(1).getReg();
1963 
1964   LLT VecTy = MRI.getType(Vec);
1965   LLT EltTy = VecTy.getElementType();
1966   assert(EltTy == MRI.getType(Dst));
1967 
1968   if (IdxVal->Value < VecTy.getNumElements())
1969     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1970   else
1971     B.buildUndef(Dst);
1972 
1973   MI.eraseFromParent();
1974   return true;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980   // TODO: Should move some of this into LegalizerHelper.
1981 
1982   // TODO: Promote dynamic indexing of s16 to s32
1983 
1984   // FIXME: Artifact combiner probably should have replaced the truncated
1985   // constant before this, so we shouldn't need
1986   // getConstantVRegValWithLookThrough.
1987   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1988     MI.getOperand(3).getReg(), MRI);
1989   if (!IdxVal) // Dynamic case will be selected to register indexing.
1990     return true;
1991 
1992   Register Dst = MI.getOperand(0).getReg();
1993   Register Vec = MI.getOperand(1).getReg();
1994   Register Ins = MI.getOperand(2).getReg();
1995 
1996   LLT VecTy = MRI.getType(Vec);
1997   LLT EltTy = VecTy.getElementType();
1998   assert(EltTy == MRI.getType(Ins));
1999 
2000   if (IdxVal->Value < VecTy.getNumElements())
2001     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
2002   else
2003     B.buildUndef(Dst);
2004 
2005   MI.eraseFromParent();
2006   return true;
2007 }
2008 
2009 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2010   MachineInstr &MI, MachineRegisterInfo &MRI,
2011   MachineIRBuilder &B) const {
2012   const LLT V2S16 = LLT::vector(2, 16);
2013 
2014   Register Dst = MI.getOperand(0).getReg();
2015   Register Src0 = MI.getOperand(1).getReg();
2016   LLT DstTy = MRI.getType(Dst);
2017   LLT SrcTy = MRI.getType(Src0);
2018 
2019   if (SrcTy == V2S16 && DstTy == V2S16 &&
2020       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2021     return true;
2022 
2023   MachineIRBuilder HelperBuilder(MI);
2024   GISelObserverWrapper DummyObserver;
2025   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2026   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2027 }
2028 
2029 bool AMDGPULegalizerInfo::legalizeSinCos(
2030   MachineInstr &MI, MachineRegisterInfo &MRI,
2031   MachineIRBuilder &B) const {
2032 
2033   Register DstReg = MI.getOperand(0).getReg();
2034   Register SrcReg = MI.getOperand(1).getReg();
2035   LLT Ty = MRI.getType(DstReg);
2036   unsigned Flags = MI.getFlags();
2037 
2038   Register TrigVal;
2039   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2040   if (ST.hasTrigReducedRange()) {
2041     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2042     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2043       .addUse(MulVal.getReg(0))
2044       .setMIFlags(Flags).getReg(0);
2045   } else
2046     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2047 
2048   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2049     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2050   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2051     .addUse(TrigVal)
2052     .setMIFlags(Flags);
2053   MI.eraseFromParent();
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2058                                                   MachineIRBuilder &B,
2059                                                   const GlobalValue *GV,
2060                                                   int64_t Offset,
2061                                                   unsigned GAFlags) const {
2062   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2063   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2064   // to the following code sequence:
2065   //
2066   // For constant address space:
2067   //   s_getpc_b64 s[0:1]
2068   //   s_add_u32 s0, s0, $symbol
2069   //   s_addc_u32 s1, s1, 0
2070   //
2071   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2072   //   a fixup or relocation is emitted to replace $symbol with a literal
2073   //   constant, which is a pc-relative offset from the encoding of the $symbol
2074   //   operand to the global variable.
2075   //
2076   // For global address space:
2077   //   s_getpc_b64 s[0:1]
2078   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2079   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2080   //
2081   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2082   //   fixups or relocations are emitted to replace $symbol@*@lo and
2083   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2084   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2085   //   operand to the global variable.
2086   //
2087   // What we want here is an offset from the value returned by s_getpc
2088   // (which is the address of the s_add_u32 instruction) to the global
2089   // variable, but since the encoding of $symbol starts 4 bytes after the start
2090   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2091   // small. This requires us to add 4 to the global variable offset in order to
2092   // compute the correct address.
2093 
2094   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2095 
2096   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2097     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2098 
2099   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2100     .addDef(PCReg);
2101 
2102   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2103   if (GAFlags == SIInstrInfo::MO_NONE)
2104     MIB.addImm(0);
2105   else
2106     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2107 
2108   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2109 
2110   if (PtrTy.getSizeInBits() == 32)
2111     B.buildExtract(DstReg, PCReg, 0);
2112   return true;
2113  }
2114 
2115 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2116   MachineInstr &MI, MachineRegisterInfo &MRI,
2117   MachineIRBuilder &B) const {
2118   Register DstReg = MI.getOperand(0).getReg();
2119   LLT Ty = MRI.getType(DstReg);
2120   unsigned AS = Ty.getAddressSpace();
2121 
2122   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2123   MachineFunction &MF = B.getMF();
2124   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2125 
2126   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2127     if (!MFI->isEntryFunction()) {
2128       const Function &Fn = MF.getFunction();
2129       DiagnosticInfoUnsupported BadLDSDecl(
2130         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2131         DS_Warning);
2132       Fn.getContext().diagnose(BadLDSDecl);
2133 
2134       // We currently don't have a way to correctly allocate LDS objects that
2135       // aren't directly associated with a kernel. We do force inlining of
2136       // functions that use local objects. However, if these dead functions are
2137       // not eliminated, we don't want a compile time error. Just emit a warning
2138       // and a trap, since there should be no callable path here.
2139       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2140       B.buildUndef(DstReg);
2141       MI.eraseFromParent();
2142       return true;
2143     }
2144 
2145     // TODO: We could emit code to handle the initialization somewhere.
2146     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2147       const SITargetLowering *TLI = ST.getTargetLowering();
2148       if (!TLI->shouldUseLDSConstAddress(GV)) {
2149         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2151       }
2152 
2153       B.buildConstant(
2154           DstReg,
2155           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2156       MI.eraseFromParent();
2157       return true;
2158     }
2159 
2160     const Function &Fn = MF.getFunction();
2161     DiagnosticInfoUnsupported BadInit(
2162       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2163     Fn.getContext().diagnose(BadInit);
2164     return true;
2165   }
2166 
2167   const SITargetLowering *TLI = ST.getTargetLowering();
2168 
2169   if (TLI->shouldEmitFixup(GV)) {
2170     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2171     MI.eraseFromParent();
2172     return true;
2173   }
2174 
2175   if (TLI->shouldEmitPCReloc(GV)) {
2176     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2177     MI.eraseFromParent();
2178     return true;
2179   }
2180 
2181   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2182   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2183 
2184   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2185       MachinePointerInfo::getGOT(MF),
2186       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2187           MachineMemOperand::MOInvariant,
2188       8 /*Size*/, Align(8));
2189 
2190   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2191 
2192   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2194     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2195     B.buildExtract(DstReg, Load, 0);
2196   } else
2197     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2198 
2199   MI.eraseFromParent();
2200   return true;
2201 }
2202 
2203 bool AMDGPULegalizerInfo::legalizeLoad(
2204   MachineInstr &MI, MachineRegisterInfo &MRI,
2205   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2206   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2207   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2208   Observer.changingInstr(MI);
2209   MI.getOperand(1).setReg(Cast.getReg(0));
2210   Observer.changedInstr(MI);
2211   return true;
2212 }
2213 
2214 bool AMDGPULegalizerInfo::legalizeFMad(
2215   MachineInstr &MI, MachineRegisterInfo &MRI,
2216   MachineIRBuilder &B) const {
2217   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2218   assert(Ty.isScalar());
2219 
2220   MachineFunction &MF = B.getMF();
2221   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2222 
2223   // TODO: Always legal with future ftz flag.
2224   // FIXME: Do we need just output?
2225   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2226     return true;
2227   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2228     return true;
2229 
2230   MachineIRBuilder HelperBuilder(MI);
2231   GISelObserverWrapper DummyObserver;
2232   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2233   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2234 }
2235 
2236 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2237   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2238   Register DstReg = MI.getOperand(0).getReg();
2239   Register PtrReg = MI.getOperand(1).getReg();
2240   Register CmpVal = MI.getOperand(2).getReg();
2241   Register NewVal = MI.getOperand(3).getReg();
2242 
2243   assert(SITargetLowering::isFlatGlobalAddrSpace(
2244            MRI.getType(PtrReg).getAddressSpace()) &&
2245          "this should not have been custom lowered");
2246 
2247   LLT ValTy = MRI.getType(CmpVal);
2248   LLT VecTy = LLT::vector(2, ValTy);
2249 
2250   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2251 
2252   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2253     .addDef(DstReg)
2254     .addUse(PtrReg)
2255     .addUse(PackedVal)
2256     .setMemRefs(MI.memoperands());
2257 
2258   MI.eraseFromParent();
2259   return true;
2260 }
2261 
2262 bool AMDGPULegalizerInfo::legalizeFlog(
2263   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2264   Register Dst = MI.getOperand(0).getReg();
2265   Register Src = MI.getOperand(1).getReg();
2266   LLT Ty = B.getMRI()->getType(Dst);
2267   unsigned Flags = MI.getFlags();
2268 
2269   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2270   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2271 
2272   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2273   MI.eraseFromParent();
2274   return true;
2275 }
2276 
2277 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2278                                        MachineIRBuilder &B) const {
2279   Register Dst = MI.getOperand(0).getReg();
2280   Register Src = MI.getOperand(1).getReg();
2281   unsigned Flags = MI.getFlags();
2282   LLT Ty = B.getMRI()->getType(Dst);
2283 
2284   auto K = B.buildFConstant(Ty, numbers::log2e);
2285   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2286   B.buildFExp2(Dst, Mul, Flags);
2287   MI.eraseFromParent();
2288   return true;
2289 }
2290 
2291 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2292                                        MachineIRBuilder &B) const {
2293   Register Dst = MI.getOperand(0).getReg();
2294   Register Src0 = MI.getOperand(1).getReg();
2295   Register Src1 = MI.getOperand(2).getReg();
2296   unsigned Flags = MI.getFlags();
2297   LLT Ty = B.getMRI()->getType(Dst);
2298   const LLT S16 = LLT::scalar(16);
2299   const LLT S32 = LLT::scalar(32);
2300 
2301   if (Ty == S32) {
2302     auto Log = B.buildFLog2(S32, Src0, Flags);
2303     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2304       .addUse(Log.getReg(0))
2305       .addUse(Src1)
2306       .setMIFlags(Flags);
2307     B.buildFExp2(Dst, Mul, Flags);
2308   } else if (Ty == S16) {
2309     // There's no f16 fmul_legacy, so we need to convert for it.
2310     auto Log = B.buildFLog2(S16, Src0, Flags);
2311     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2312     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2313     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2314       .addUse(Ext0.getReg(0))
2315       .addUse(Ext1.getReg(0))
2316       .setMIFlags(Flags);
2317 
2318     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2319   } else
2320     return false;
2321 
2322   MI.eraseFromParent();
2323   return true;
2324 }
2325 
2326 // Find a source register, ignoring any possible source modifiers.
2327 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2328   Register ModSrc = OrigSrc;
2329   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2330     ModSrc = SrcFNeg->getOperand(1).getReg();
2331     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2332       ModSrc = SrcFAbs->getOperand(1).getReg();
2333   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2334     ModSrc = SrcFAbs->getOperand(1).getReg();
2335   return ModSrc;
2336 }
2337 
2338 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2339                                          MachineRegisterInfo &MRI,
2340                                          MachineIRBuilder &B) const {
2341 
2342   const LLT S1 = LLT::scalar(1);
2343   const LLT S64 = LLT::scalar(64);
2344   Register Dst = MI.getOperand(0).getReg();
2345   Register OrigSrc = MI.getOperand(1).getReg();
2346   unsigned Flags = MI.getFlags();
2347   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2348          "this should not have been custom lowered");
2349 
2350   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2351   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2352   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2353   // V_FRACT bug is:
2354   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2355   //
2356   // Convert floor(x) to (x - fract(x))
2357 
2358   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2359     .addUse(OrigSrc)
2360     .setMIFlags(Flags);
2361 
2362   // Give source modifier matching some assistance before obscuring a foldable
2363   // pattern.
2364 
2365   // TODO: We can avoid the neg on the fract? The input sign to fract
2366   // shouldn't matter?
2367   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2368 
2369   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2370 
2371   Register Min = MRI.createGenericVirtualRegister(S64);
2372 
2373   // We don't need to concern ourselves with the snan handling difference, so
2374   // use the one which will directly select.
2375   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2376   if (MFI->getMode().IEEE)
2377     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2378   else
2379     B.buildFMinNum(Min, Fract, Const, Flags);
2380 
2381   Register CorrectedFract = Min;
2382   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2383     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2384     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2385   }
2386 
2387   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2388   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2389 
2390   MI.eraseFromParent();
2391   return true;
2392 }
2393 
2394 // Turn an illegal packed v2s16 build vector into bit operations.
2395 // TODO: This should probably be a bitcast action in LegalizerHelper.
2396 bool AMDGPULegalizerInfo::legalizeBuildVector(
2397   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2398   Register Dst = MI.getOperand(0).getReg();
2399   const LLT S32 = LLT::scalar(32);
2400   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2401 
2402   Register Src0 = MI.getOperand(1).getReg();
2403   Register Src1 = MI.getOperand(2).getReg();
2404   assert(MRI.getType(Src0) == LLT::scalar(16));
2405 
2406   auto Merge = B.buildMerge(S32, {Src0, Src1});
2407   B.buildBitcast(Dst, Merge);
2408 
2409   MI.eraseFromParent();
2410   return true;
2411 }
2412 
2413 // Return the use branch instruction, otherwise null if the usage is invalid.
2414 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2415                                        MachineRegisterInfo &MRI,
2416                                        MachineInstr *&Br,
2417                                        MachineBasicBlock *&UncondBrTarget) {
2418   Register CondDef = MI.getOperand(0).getReg();
2419   if (!MRI.hasOneNonDBGUse(CondDef))
2420     return nullptr;
2421 
2422   MachineBasicBlock *Parent = MI.getParent();
2423   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2424   if (UseMI.getParent() != Parent ||
2425       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2426     return nullptr;
2427 
2428   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2429   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2430   if (Next == Parent->end()) {
2431     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2432     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2433       return nullptr;
2434     UncondBrTarget = &*NextMBB;
2435   } else {
2436     if (Next->getOpcode() != AMDGPU::G_BR)
2437       return nullptr;
2438     Br = &*Next;
2439     UncondBrTarget = Br->getOperand(0).getMBB();
2440   }
2441 
2442   return &UseMI;
2443 }
2444 
2445 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2446                                                MachineRegisterInfo &MRI,
2447                                                Register LiveIn,
2448                                                Register PhyReg) const {
2449   assert(PhyReg.isPhysical() && "Physical register expected");
2450 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2453   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2454   if (!MRI.getVRegDef(LiveIn)) {
2455     // FIXME: Should have scoped insert pt
2456     MachineBasicBlock &OrigInsBB = B.getMBB();
2457     auto OrigInsPt = B.getInsertPt();
2458 
2459     MachineBasicBlock &EntryMBB = B.getMF().front();
2460     EntryMBB.addLiveIn(PhyReg);
2461     B.setInsertPt(EntryMBB, EntryMBB.begin());
2462     B.buildCopy(LiveIn, PhyReg);
2463 
2464     B.setInsertPt(OrigInsBB, OrigInsPt);
2465   }
2466 
2467   return LiveIn;
2468 }
2469 
2470 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2471                                                 MachineRegisterInfo &MRI,
2472                                                 Register PhyReg, LLT Ty,
2473                                                 bool InsertLiveInCopy) const {
2474   assert(PhyReg.isPhysical() && "Physical register expected");
2475 
  // Get or create a virtual live-in register.
2477   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2478   if (!LiveIn) {
2479     LiveIn = MRI.createGenericVirtualRegister(Ty);
2480     MRI.addLiveIn(PhyReg, LiveIn);
2481   }
2482 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting the live-in copy from the
  // physical register to the virtual register is not required.
2486   if (!InsertLiveInCopy)
2487     return LiveIn;
2488 
2489   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2490 }
2491 
2492 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2493     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2494   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2495   const ArgDescriptor *Arg;
2496   const TargetRegisterClass *RC;
2497   LLT ArgTy;
2498   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2499   if (!Arg) {
2500     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2501     return nullptr;
2502   }
2503   return Arg;
2504 }
2505 
2506 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2507                                          const ArgDescriptor *Arg) const {
2508   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2509     return false; // TODO: Handle these
2510 
2511   Register SrcReg = Arg->getRegister();
2512   assert(SrcReg.isPhysical() && "Physical register expected");
2513   assert(DstReg.isVirtual() && "Virtual register expected");
2514 
2515   MachineRegisterInfo &MRI = *B.getMRI();
2516 
2517   LLT Ty = MRI.getType(DstReg);
2518   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2519 
2520   if (Arg->isMasked()) {
2521     // TODO: Should we try to emit this once in the entry block?
2522     const LLT S32 = LLT::scalar(32);
2523     const unsigned Mask = Arg->getMask();
2524     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
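
    // Extract the bit field selected by Mask: (Reg & Mask) >> Shift, computed
    // here as (Reg >> Shift) & (Mask >> Shift).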
2525 
2526     Register AndMaskSrc = LiveIn;
2527 
2528     if (Shift != 0) {
2529       auto ShiftAmt = B.buildConstant(S32, Shift);
2530       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2531     }
2532 
2533     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2534   } else {
2535     B.buildCopy(DstReg, LiveIn);
2536   }
2537 
2538   return true;
2539 }
2540 
2541 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2542     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2543     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2544 
2545   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2546   if (!Arg)
2547     return false;
2548 
2549   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2550     return false;
2551 
2552   MI.eraseFromParent();
2553   return true;
2554 }
2555 
2556 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2557                                        MachineRegisterInfo &MRI,
2558                                        MachineIRBuilder &B) const {
2559   Register Dst = MI.getOperand(0).getReg();
2560   LLT DstTy = MRI.getType(Dst);
2561   LLT S16 = LLT::scalar(16);
2562   LLT S32 = LLT::scalar(32);
2563   LLT S64 = LLT::scalar(64);
2564 
2565   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2566     return true;
2567 
2568   if (DstTy == S16)
2569     return legalizeFDIV16(MI, MRI, B);
2570   if (DstTy == S32)
2571     return legalizeFDIV32(MI, MRI, B);
2572   if (DstTy == S64)
2573     return legalizeFDIV64(MI, MRI, B);
2574 
2575   return false;
2576 }
2577 
2578 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2579                                                   Register DstReg,
2580                                                   Register X,
2581                                                   Register Y,
2582                                                   bool IsDiv) const {
2583   const LLT S1 = LLT::scalar(1);
2584   const LLT S32 = LLT::scalar(32);
2585 
2586   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2587   // algorithm used here.
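  //
  // In brief: compute an estimate z ~= 2^32 / y from the hardware f32
  // reciprocal, refine it with one Newton-Raphson step
  // (z += umulh(z, -y * z)), form q = umulh(x, z) and r = x - q * y, and then
  // apply up to two conditional corrections while r >= y.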
2588 
2589   // Initial estimate of inv(y).
2590   auto FloatY = B.buildUITOFP(S32, Y);
2591   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2592   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2593   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2594   auto Z = B.buildFPTOUI(S32, ScaledY);
2595 
  // One round of UNR (unsigned Newton-Raphson refinement).
2597   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2598   auto NegYZ = B.buildMul(S32, NegY, Z);
2599   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2600 
2601   // Quotient/remainder estimate.
2602   auto Q = B.buildUMulH(S32, X, Z);
2603   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2604 
2605   // First quotient/remainder refinement.
2606   auto One = B.buildConstant(S32, 1);
2607   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2608   if (IsDiv)
2609     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2610   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2611 
2612   // Second quotient/remainder refinement.
2613   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2614   if (IsDiv)
2615     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2616   else
2617     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2618 }
2619 
2620 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2621                                               MachineRegisterInfo &MRI,
2622                                               MachineIRBuilder &B) const {
2623   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2624   Register DstReg = MI.getOperand(0).getReg();
2625   Register Num = MI.getOperand(1).getReg();
2626   Register Den = MI.getOperand(2).getReg();
2627   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2628   MI.eraseFromParent();
2629   return true;
2630 }
2631 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2633 //
2634 // Return lo, hi of result
2635 //
2636 // %cvt.lo = G_UITOFP Val.lo
2637 // %cvt.hi = G_UITOFP Val.hi
2638 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2639 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2640 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2641 // %mul2 = G_FMUL %mul1, 2**(-32)
2642 // %trunc = G_INTRINSIC_TRUNC %mul2
2643 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2644 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
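//
// The f32 constants encode 2**32 (0x4f800000), just under 2**64 (0x5f7ffffc),
// 2**(-32) (0x2f800000), and -(2**32) (0xcf800000).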
2645 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2646                                                        Register Val) {
2647   const LLT S32 = LLT::scalar(32);
2648   auto Unmerge = B.buildUnmerge(S32, Val);
2649 
2650   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2651   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2652 
2653   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2654                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2655 
2656   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2657   auto Mul1 =
2658       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2659 
2660   // 2**(-32)
2661   auto Mul2 =
2662       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2663   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2664 
2665   // -(2**32)
2666   auto Mad2 = B.buildFMAD(S32, Trunc,
2667                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2668 
2669   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2670   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2671 
2672   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2673 }
2674 
2675 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2676                                                   Register DstReg,
2677                                                   Register Numer,
2678                                                   Register Denom,
2679                                                   bool IsDiv) const {
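  // This mirrors the structure of the 32-bit expansion: start from an
  // approximation of 2^64 / Denom, run two Newton-Raphson-style refinement
  // rounds using 64-bit mul/umulh, take Quotient = umulh(Numer, Rcp) and
  // Remainder = Numer - Quotient * Denom, then apply up to two conditional
  // corrections (quotient + 1, remainder - Denom).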
2680   const LLT S32 = LLT::scalar(32);
2681   const LLT S64 = LLT::scalar(64);
2682   const LLT S1 = LLT::scalar(1);
2683   Register RcpLo, RcpHi;
2684 
2685   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2686 
2687   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2688 
2689   auto Zero64 = B.buildConstant(S64, 0);
2690   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2691 
2692   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2693   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2694 
2695   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2696   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2697   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2698 
2699   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2700   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2701   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2702   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2703 
2704   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2705   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2706   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2707   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2708   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2709 
2710   auto Zero32 = B.buildConstant(S32, 0);
2711   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2712   auto Add2_HiC =
2713       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2714   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2715   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2716 
2717   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2718   Register NumerLo = UnmergeNumer.getReg(0);
2719   Register NumerHi = UnmergeNumer.getReg(1);
2720 
2721   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2722   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2723   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2724   Register Mul3_Lo = UnmergeMul3.getReg(0);
2725   Register Mul3_Hi = UnmergeMul3.getReg(1);
2726   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2727   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2728   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2729   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2730 
2731   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2732   Register DenomLo = UnmergeDenom.getReg(0);
2733   Register DenomHi = UnmergeDenom.getReg(1);
2734 
2735   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2736   auto C1 = B.buildSExt(S32, CmpHi);
2737 
2738   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2739   auto C2 = B.buildSExt(S32, CmpLo);
2740 
2741   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2742   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2743 
  // TODO: Here and below, portions of the code could be enclosed in if/endif
  // blocks. Currently the control flow is unconditional and we use 4 selects
  // after the potential endif to substitute for PHIs.
2747 
2748   // if C3 != 0 ...
2749   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2750   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2751   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2752   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2753 
2754   auto One64 = B.buildConstant(S64, 1);
2755   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2756 
2757   auto C4 =
2758       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2759   auto C5 =
2760       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2761   auto C6 = B.buildSelect(
2762       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2763 
2764   // if (C6 != 0)
2765   auto Add4 = B.buildAdd(S64, Add3, One64);
2766   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2767 
2768   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2769   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2770   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2771 
2772   // endif C6
2773   // endif C3
2774 
2775   if (IsDiv) {
2776     auto Sel1 = B.buildSelect(
2777         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2778     B.buildSelect(DstReg,
2779                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2780   } else {
2781     auto Sel2 = B.buildSelect(
2782         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2783     B.buildSelect(DstReg,
2784                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2785   }
2786 }
2787 
2788 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2789                                             MachineRegisterInfo &MRI,
2790                                             MachineIRBuilder &B) const {
2791   const LLT S64 = LLT::scalar(64);
2792   const LLT S32 = LLT::scalar(32);
2793   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2794   Register DstReg = MI.getOperand(0).getReg();
2795   Register Num = MI.getOperand(1).getReg();
2796   Register Den = MI.getOperand(2).getReg();
2797   LLT Ty = MRI.getType(DstReg);
2798 
2799   if (Ty == S32)
2800     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2801   else if (Ty == S64)
2802     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2803   else
2804     return false;
2805 
2806   MI.eraseFromParent();
  return true;
}
2810 
2811 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2812                                             MachineRegisterInfo &MRI,
2813                                             MachineIRBuilder &B) const {
2814   const LLT S64 = LLT::scalar(64);
2815   const LLT S32 = LLT::scalar(32);
2816 
2817   Register DstReg = MI.getOperand(0).getReg();
2818   const LLT Ty = MRI.getType(DstReg);
2819   if (Ty != S32 && Ty != S64)
2820     return false;
2821 
2822   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2823 
2824   Register LHS = MI.getOperand(1).getReg();
2825   Register RHS = MI.getOperand(2).getReg();
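
  // Take absolute values with the usual sign trick: for Sign = X >> (N - 1),
  // (X + Sign) ^ Sign == |X|. The quotient's sign is sign(LHS) ^ sign(RHS),
  // the remainder takes sign(LHS), and the final result is restored the same
  // way: (UDivRem ^ Sign) - Sign.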
2826 
2827   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2828   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2829   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2830 
2831   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2832   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2833 
2834   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2835   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2836 
2837   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2838   if (Ty == S32)
2839     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2840   else
2841     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2842 
2843   Register Sign;
2844   if (IsDiv)
2845     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2846   else
2847     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2848 
2849   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2850   B.buildSub(DstReg, UDivRem, Sign);
2851 
2852   MI.eraseFromParent();
2853   return true;
2854 }
2855 
2856 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2857                                                  MachineRegisterInfo &MRI,
2858                                                  MachineIRBuilder &B) const {
2859   Register Res = MI.getOperand(0).getReg();
2860   Register LHS = MI.getOperand(1).getReg();
2861   Register RHS = MI.getOperand(2).getReg();
2862 
2863   uint16_t Flags = MI.getFlags();
2864 
2865   LLT ResTy = MRI.getType(Res);
2866   LLT S32 = LLT::scalar(32);
2867   LLT S64 = LLT::scalar(64);
2868 
2869   const MachineFunction &MF = B.getMF();
2870   bool Unsafe =
2871     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2872 
2873   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2874     return false;
2875 
2876   if (!Unsafe && ResTy == S32 &&
2877       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2878     return false;
2879 
2880   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2881     // 1 / x -> RCP(x)
2882     if (CLHS->isExactlyValue(1.0)) {
2883       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2884         .addUse(RHS)
2885         .setMIFlags(Flags);
2886 
2887       MI.eraseFromParent();
2888       return true;
2889     }
2890 
2891     // -1 / x -> RCP( FNEG(x) )
2892     if (CLHS->isExactlyValue(-1.0)) {
2893       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2894       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2895         .addUse(FNeg.getReg(0))
2896         .setMIFlags(Flags);
2897 
2898       MI.eraseFromParent();
2899       return true;
2900     }
2901   }
2902 
2903   // x / y -> x * (1.0 / y)
2904   if (Unsafe) {
2905     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2906       .addUse(RHS)
2907       .setMIFlags(Flags);
2908     B.buildFMul(Res, LHS, RCP, Flags);
2909 
2910     MI.eraseFromParent();
2911     return true;
2912   }
2913 
2914   return false;
2915 }
2916 
2917 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2918                                          MachineRegisterInfo &MRI,
2919                                          MachineIRBuilder &B) const {
2920   Register Res = MI.getOperand(0).getReg();
2921   Register LHS = MI.getOperand(1).getReg();
2922   Register RHS = MI.getOperand(2).getReg();
2923 
2924   uint16_t Flags = MI.getFlags();
2925 
2926   LLT S16 = LLT::scalar(16);
2927   LLT S32 = LLT::scalar(32);
2928 
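  // Expand f16 division by promoting to f32, approximating with the hardware
  // reciprocal, truncating back to f16, and letting div_fixup handle the
  // special cases.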
2929   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2930   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2931 
2932   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2933     .addUse(RHSExt.getReg(0))
2934     .setMIFlags(Flags);
2935 
2936   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2937   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2938 
2939   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2940     .addUse(RDst.getReg(0))
2941     .addUse(RHS)
2942     .addUse(LHS)
2943     .setMIFlags(Flags);
2944 
2945   MI.eraseFromParent();
2946   return true;
2947 }
2948 
// Enable or disable FP32 denormal mode. When 'Enable' is true, emit
// instructions to enable denormal handling; otherwise restore the function's
// default FP32 denormal mode.
2951 static void toggleSPDenormMode(bool Enable,
2952                                MachineIRBuilder &B,
2953                                const GCNSubtarget &ST,
2954                                AMDGPU::SIModeRegisterDefaults Mode) {
2955   // Set SP denorm mode to this value.
2956   unsigned SPDenormMode =
2957     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2958 
2959   if (ST.hasDenormModeInst()) {
2960     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2961     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2962 
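    // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] select the FP32
    // denorm mode and bits [3:2] the FP64/FP16 mode.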
2963     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2964     B.buildInstr(AMDGPU::S_DENORM_MODE)
2965       .addImm(NewDenormModeValue);
2966 
2967   } else {
2968     // Select FP32 bit field in mode register.
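    // The immediate encodes hwreg(HW_REG_MODE, offset = 4, width = 2), with
    // the width stored as width - 1; the SP denorm control occupies bits
    // [5:4] of the MODE register.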
2969     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2970                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2971                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2972 
2973     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2974       .addImm(SPDenormMode)
2975       .addImm(SPDenormModeBitField);
2976   }
2977 }
2978 
2979 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2980                                          MachineRegisterInfo &MRI,
2981                                          MachineIRBuilder &B) const {
2982   Register Res = MI.getOperand(0).getReg();
2983   Register LHS = MI.getOperand(1).getReg();
2984   Register RHS = MI.getOperand(2).getReg();
2985   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2986   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2987 
2988   uint16_t Flags = MI.getFlags();
2989 
2990   LLT S32 = LLT::scalar(32);
2991   LLT S1 = LLT::scalar(1);
2992 
2993   auto One = B.buildFConstant(S32, 1.0f);
2994 
2995   auto DenominatorScaled =
2996     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2997       .addUse(LHS)
2998       .addUse(RHS)
2999       .addImm(0)
3000       .setMIFlags(Flags);
3001   auto NumeratorScaled =
3002     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3003       .addUse(LHS)
3004       .addUse(RHS)
3005       .addImm(1)
3006       .setMIFlags(Flags);
3007 
3008   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3009     .addUse(DenominatorScaled.getReg(0))
3010     .setMIFlags(Flags);
3011   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3012 
3013   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3014   // aren't modeled as reading it.
3015   if (!Mode.allFP32Denormals())
3016     toggleSPDenormMode(true, B, ST, Mode);
3017 
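  // Refine the initial reciprocal estimate with a Newton-Raphson step, then
  // compute the quotient and apply FMA-based correction steps. The sequence
  // relies on FP32 denormals being enabled for full precision, which is why
  // the denorm mode is toggled around it.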
3018   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3019   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3020   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3021   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3022   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3023   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3024 
3025   if (!Mode.allFP32Denormals())
3026     toggleSPDenormMode(false, B, ST, Mode);
3027 
3028   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3029     .addUse(Fma4.getReg(0))
3030     .addUse(Fma1.getReg(0))
3031     .addUse(Fma3.getReg(0))
3032     .addUse(NumeratorScaled.getReg(1))
3033     .setMIFlags(Flags);
3034 
3035   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3036     .addUse(Fmas.getReg(0))
3037     .addUse(RHS)
3038     .addUse(LHS)
3039     .setMIFlags(Flags);
3040 
3041   MI.eraseFromParent();
3042   return true;
3043 }
3044 
3045 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3046                                          MachineRegisterInfo &MRI,
3047                                          MachineIRBuilder &B) const {
3048   Register Res = MI.getOperand(0).getReg();
3049   Register LHS = MI.getOperand(1).getReg();
3050   Register RHS = MI.getOperand(2).getReg();
3051 
3052   uint16_t Flags = MI.getFlags();
3053 
3054   LLT S64 = LLT::scalar(64);
3055   LLT S1 = LLT::scalar(1);
3056 
3057   auto One = B.buildFConstant(S64, 1.0);
3058 
3059   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3060     .addUse(LHS)
3061     .addUse(RHS)
3062     .addImm(0)
3063     .setMIFlags(Flags);
3064 
3065   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3066 
3067   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3068     .addUse(DivScale0.getReg(0))
3069     .setMIFlags(Flags);
3070 
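  // Refine the reciprocal estimate with two Newton-Raphson steps, then form
  // the quotient and a correction term; div_fmas and div_fixup apply the
  // final rounding and special-case handling.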
3071   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3072   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3073   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3074 
3075   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3076     .addUse(LHS)
3077     .addUse(RHS)
3078     .addImm(1)
3079     .setMIFlags(Flags);
3080 
3081   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3082   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3083   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3084 
3085   Register Scale;
3086   if (!ST.hasUsableDivScaleConditionOutput()) {
3087     // Workaround a hardware bug on SI where the condition output from div_scale
3088     // is not usable.
3089 
3090     LLT S32 = LLT::scalar(32);
3091 
3092     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3093     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3094     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3095     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3096 
3097     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3098                               Scale1Unmerge.getReg(1));
3099     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3100                               Scale0Unmerge.getReg(1));
3101     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3102   } else {
3103     Scale = DivScale1.getReg(1);
3104   }
3105 
3106   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3107     .addUse(Fma4.getReg(0))
3108     .addUse(Fma3.getReg(0))
3109     .addUse(Mul.getReg(0))
3110     .addUse(Scale)
3111     .setMIFlags(Flags);
3112 
3113   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3114     .addUse(Fmas.getReg(0))
3115     .addUse(RHS)
3116     .addUse(LHS)
3117     .setMIFlags(Flags);
3118 
3119   MI.eraseFromParent();
3120   return true;
3121 }
3122 
3123 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3124                                                  MachineRegisterInfo &MRI,
3125                                                  MachineIRBuilder &B) const {
3126   Register Res = MI.getOperand(0).getReg();
3127   Register LHS = MI.getOperand(2).getReg();
3128   Register RHS = MI.getOperand(3).getReg();
3129   uint16_t Flags = MI.getFlags();
3130 
3131   LLT S32 = LLT::scalar(32);
3132   LLT S1 = LLT::scalar(1);
3133 
3134   auto Abs = B.buildFAbs(S32, RHS, Flags);
3135   const APFloat C0Val(1.0f);
3136 
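  // 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32 as f32 bit patterns. When the
  // denominator is very large, scale it down by 2^-32 before taking the
  // reciprocal, and fold the same factor back into the final product to keep
  // the intermediate values in range.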
3137   auto C0 = B.buildConstant(S32, 0x6f800000);
3138   auto C1 = B.buildConstant(S32, 0x2f800000);
3139   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3140 
3141   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3142   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3143 
3144   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3145 
3146   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3147     .addUse(Mul0.getReg(0))
3148     .setMIFlags(Flags);
3149 
3150   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3151 
3152   B.buildFMul(Res, Sel, Mul1, Flags);
3153 
3154   MI.eraseFromParent();
3155   return true;
3156 }
3157 
3158 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3159                                             MachineRegisterInfo &MRI,
3160                                             MachineIRBuilder &B) const {
3161   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
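  // The implicit arguments live directly after the explicit kernel arguments,
  // so the pointer is formed by offsetting the kernarg segment pointer by the
  // size of the explicit argument block.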
3162   uint64_t Offset =
3163     ST.getTargetLowering()->getImplicitParameterOffset(
3164       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3165   LLT DstTy = MRI.getType(DstReg);
3166   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3167 
3168   const ArgDescriptor *Arg;
3169   const TargetRegisterClass *RC;
3170   LLT ArgTy;
3171   std::tie(Arg, RC, ArgTy) =
3172       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3173   if (!Arg)
3174     return false;
3175 
3176   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3177   if (!loadInputValue(KernargPtrReg, B, Arg))
3178     return false;
3179 
3180   // FIXME: This should be nuw
3181   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3182   return true;
3183 }
3184 
3185 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3186                                                  MachineRegisterInfo &MRI,
3187                                                  MachineIRBuilder &B) const {
3188   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3189   if (!MFI->isEntryFunction()) {
3190     return legalizePreloadedArgIntrin(MI, MRI, B,
3191                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3192   }
3193 
3194   Register DstReg = MI.getOperand(0).getReg();
3195   if (!getImplicitArgPtr(DstReg, MRI, B))
3196     return false;
3197 
3198   MI.eraseFromParent();
3199   return true;
3200 }
3201 
3202 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3203                                               MachineRegisterInfo &MRI,
3204                                               MachineIRBuilder &B,
3205                                               unsigned AddrSpace) const {
3206   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3207   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3208   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3209   MI.eraseFromParent();
3210   return true;
3211 }
3212 
3213 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3214 // offset (the offset that is included in bounds checking and swizzling, to be
3215 // split between the instruction's voffset and immoffset fields) and soffset
3216 // (the offset that is excluded from bounds checking and swizzling, to go in
3217 // the instruction's soffset field).  This function takes the first kind of
3218 // offset and figures out how to split it between voffset and immoffset.
3219 std::tuple<Register, unsigned, unsigned>
3220 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3221                                         Register OrigOffset) const {
3222   const unsigned MaxImm = 4095;
3223   Register BaseReg;
3224   unsigned TotalConstOffset;
3225   MachineInstr *OffsetDef;
3226   const LLT S32 = LLT::scalar(32);
3227 
3228   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3229     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3230 
3231   unsigned ImmOffset = TotalConstOffset;
3232 
3233   // If the immediate value is too big for the immoffset field, put the value
3234   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
3236   // of being CSEd with the copy/add for another similar load/store.
3237   // However, do not do that rounding down to a multiple of 4096 if that is a
3238   // negative number, as it appears to be illegal to have a negative offset
3239   // in the vgpr, even if adding the immediate offset makes it positive.
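  // For example, a constant offset of 8197 is split into ImmOffset = 5 with an
  // Overflow of 8192 added into the voffset register.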
3240   unsigned Overflow = ImmOffset & ~MaxImm;
3241   ImmOffset -= Overflow;
3242   if ((int32_t)Overflow < 0) {
3243     Overflow += ImmOffset;
3244     ImmOffset = 0;
3245   }
3246 
3247   if (Overflow != 0) {
3248     if (!BaseReg) {
3249       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3250     } else {
3251       auto OverflowVal = B.buildConstant(S32, Overflow);
3252       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3253     }
3254   }
3255 
3256   if (!BaseReg)
3257     BaseReg = B.buildConstant(S32, 0).getReg(0);
3258 
3259   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3260 }
3261 
3262 /// Handle register layout difference for f16 images for some subtargets.
3263 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3264                                              MachineRegisterInfo &MRI,
3265                                              Register Reg) const {
3266   if (!ST.hasUnpackedD16VMem())
3267     return Reg;
3268 
3269   const LLT S16 = LLT::scalar(16);
3270   const LLT S32 = LLT::scalar(32);
3271   LLT StoreVT = MRI.getType(Reg);
3272   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3273 
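  // With the unpacked layout each 16-bit element occupies the low half of its
  // own 32-bit register, so split the vector and widen every element
  // individually.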
3274   auto Unmerge = B.buildUnmerge(S16, Reg);
3275 
3276   SmallVector<Register, 4> WideRegs;
3277   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3278     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3279 
3280   int NumElts = StoreVT.getNumElements();
3281 
3282   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3283 }
3284 
3285 Register AMDGPULegalizerInfo::fixStoreSourceType(
3286   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3287   MachineRegisterInfo *MRI = B.getMRI();
3288   LLT Ty = MRI->getType(VData);
3289 
3290   const LLT S16 = LLT::scalar(16);
3291 
3292   // Fixup illegal register types for i8 stores.
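  // Sub-dword scalars are stored from a 32-bit register; only the low bits are
  // written, so an any-extend is sufficient.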
3293   if (Ty == LLT::scalar(8) || Ty == S16) {
3294     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3295     return AnyExt;
3296   }
3297 
3298   if (Ty.isVector()) {
3299     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3300       if (IsFormat)
3301         return handleD16VData(B, *MRI, VData);
3302     }
3303   }
3304 
3305   return VData;
3306 }
3307 
3308 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3309                                               MachineRegisterInfo &MRI,
3310                                               MachineIRBuilder &B,
3311                                               bool IsTyped,
3312                                               bool IsFormat) const {
3313   Register VData = MI.getOperand(1).getReg();
3314   LLT Ty = MRI.getType(VData);
3315   LLT EltTy = Ty.getScalarType();
3316   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3317   const LLT S32 = LLT::scalar(32);
3318 
3319   VData = fixStoreSourceType(B, VData, IsFormat);
3320   Register RSrc = MI.getOperand(2).getReg();
3321 
3322   MachineMemOperand *MMO = *MI.memoperands_begin();
3323   const int MemSize = MMO->getSize();
3324 
3325   unsigned ImmOffset;
3326   unsigned TotalOffset;
3327 
3328   // The typed intrinsics add an immediate after the registers.
3329   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3330 
3331   // The struct intrinsic variants add one additional operand over raw.
3332   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3333   Register VIndex;
3334   int OpOffset = 0;
3335   if (HasVIndex) {
3336     VIndex = MI.getOperand(3).getReg();
3337     OpOffset = 1;
3338   }
3339 
3340   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3341   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3342 
3343   unsigned Format = 0;
3344   if (IsTyped) {
3345     Format = MI.getOperand(5 + OpOffset).getImm();
3346     ++OpOffset;
3347   }
3348 
3349   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3350 
3351   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3352   if (TotalOffset != 0)
3353     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3354 
3355   unsigned Opc;
3356   if (IsTyped) {
3357     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3358                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3359   } else if (IsFormat) {
3360     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3361                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3362   } else {
3363     switch (MemSize) {
3364     case 1:
3365       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3366       break;
3367     case 2:
3368       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3369       break;
3370     default:
3371       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3372       break;
3373     }
3374   }
3375 
3376   if (!VIndex)
3377     VIndex = B.buildConstant(S32, 0).getReg(0);
3378 
3379   auto MIB = B.buildInstr(Opc)
3380     .addUse(VData)              // vdata
3381     .addUse(RSrc)               // rsrc
3382     .addUse(VIndex)             // vindex
3383     .addUse(VOffset)            // voffset
3384     .addUse(SOffset)            // soffset
3385     .addImm(ImmOffset);         // offset(imm)
3386 
3387   if (IsTyped)
3388     MIB.addImm(Format);
3389 
3390   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3391      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3392      .addMemOperand(MMO);
3393 
3394   MI.eraseFromParent();
3395   return true;
3396 }
3397 
3398 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3399                                              MachineRegisterInfo &MRI,
3400                                              MachineIRBuilder &B,
3401                                              bool IsFormat,
3402                                              bool IsTyped) const {
3403   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3404   MachineMemOperand *MMO = *MI.memoperands_begin();
3405   const int MemSize = MMO->getSize();
3406   const LLT S32 = LLT::scalar(32);
3407 
3408   Register Dst = MI.getOperand(0).getReg();
3409   Register RSrc = MI.getOperand(2).getReg();
3410 
3411   // The typed intrinsics add an immediate after the registers.
3412   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3413 
3414   // The struct intrinsic variants add one additional operand over raw.
3415   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3416   Register VIndex;
3417   int OpOffset = 0;
3418   if (HasVIndex) {
3419     VIndex = MI.getOperand(3).getReg();
3420     OpOffset = 1;
3421   }
3422 
3423   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3424   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3425 
3426   unsigned Format = 0;
3427   if (IsTyped) {
3428     Format = MI.getOperand(5 + OpOffset).getImm();
3429     ++OpOffset;
3430   }
3431 
3432   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3433   unsigned ImmOffset;
3434   unsigned TotalOffset;
3435 
3436   LLT Ty = MRI.getType(Dst);
3437   LLT EltTy = Ty.getScalarType();
3438   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3439   const bool Unpacked = ST.hasUnpackedD16VMem();
3440 
3441   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3442   if (TotalOffset != 0)
3443     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3444 
3445   unsigned Opc;
3446 
3447   if (IsTyped) {
3448     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3449                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3450   } else if (IsFormat) {
3451     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3452                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3453   } else {
3454     switch (MemSize) {
3455     case 1:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3457       break;
3458     case 2:
3459       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3460       break;
3461     default:
3462       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3463       break;
3464     }
3465   }
3466 
3467   Register LoadDstReg;
3468 
3469   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3470   LLT UnpackedTy = Ty.changeElementSize(32);
3471 
3472   if (IsExtLoad)
3473     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3474   else if (Unpacked && IsD16 && Ty.isVector())
3475     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3476   else
3477     LoadDstReg = Dst;
3478 
3479   if (!VIndex)
3480     VIndex = B.buildConstant(S32, 0).getReg(0);
3481 
3482   auto MIB = B.buildInstr(Opc)
3483     .addDef(LoadDstReg)         // vdata
3484     .addUse(RSrc)               // rsrc
3485     .addUse(VIndex)             // vindex
3486     .addUse(VOffset)            // voffset
3487     .addUse(SOffset)            // soffset
3488     .addImm(ImmOffset);         // offset(imm)
3489 
3490   if (IsTyped)
3491     MIB.addImm(Format);
3492 
3493   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3494      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3495      .addMemOperand(MMO);
3496 
3497   if (LoadDstReg != Dst) {
3498     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3499 
    // The result register was widened for the extending load; truncate back
    // to the original type.
3501     if (IsExtLoad)
3502       B.buildTrunc(Dst, LoadDstReg);
3503     else {
3504       // Repack to original 16-bit vector result
3505       // FIXME: G_TRUNC should work, but legalization currently fails
3506       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3507       SmallVector<Register, 4> Repack;
3508       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3509         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3510       B.buildMerge(Dst, Repack);
3511     }
3512   }
3513 
3514   MI.eraseFromParent();
3515   return true;
3516 }
3517 
3518 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3519                                                MachineIRBuilder &B,
3520                                                bool IsInc) const {
3521   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3522                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3523   B.buildInstr(Opc)
3524     .addDef(MI.getOperand(0).getReg())
3525     .addUse(MI.getOperand(2).getReg())
3526     .addUse(MI.getOperand(3).getReg())
3527     .cloneMemRefs(MI);
3528   MI.eraseFromParent();
3529   return true;
3530 }
3531 
3532 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3533   switch (IntrID) {
3534   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3535   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3536     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3537   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3538   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3539     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3540   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3541   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3542     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3543   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3544   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3546   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3547   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3548     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3549   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3550   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3551     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3552   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3553   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3554     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3555   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3556   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3557     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3558   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3559   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3560     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3561   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3562   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3563     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3564   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3565   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3566     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3567   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3568   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3569     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3570   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3571   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3572     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3573   default:
3574     llvm_unreachable("unhandled atomic opcode");
3575   }
3576 }
3577 
3578 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3579                                                MachineIRBuilder &B,
3580                                                Intrinsic::ID IID) const {
3581   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3582                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3583 
3584   Register Dst = MI.getOperand(0).getReg();
3585   Register VData = MI.getOperand(2).getReg();
3586 
3587   Register CmpVal;
3588   int OpOffset = 0;
3589 
3590   if (IsCmpSwap) {
3591     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3592     ++OpOffset;
3593   }
3594 
3595   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3596   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3597 
3598   // The struct intrinsic variants add one additional operand over raw.
3599   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3600   Register VIndex;
3601   if (HasVIndex) {
3602     VIndex = MI.getOperand(4 + OpOffset).getReg();
3603     ++OpOffset;
3604   }
3605 
3606   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3607   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3608   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3609 
3610   MachineMemOperand *MMO = *MI.memoperands_begin();
3611 
3612   unsigned ImmOffset;
3613   unsigned TotalOffset;
3614   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3615   if (TotalOffset != 0)
3616     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3617 
3618   if (!VIndex)
3619     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3620 
3621   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3622     .addDef(Dst)
3623     .addUse(VData); // vdata
3624 
3625   if (IsCmpSwap)
3626     MIB.addReg(CmpVal);
3627 
3628   MIB.addUse(RSrc)               // rsrc
3629      .addUse(VIndex)             // vindex
3630      .addUse(VOffset)            // voffset
3631      .addUse(SOffset)            // soffset
3632      .addImm(ImmOffset)          // offset(imm)
3633      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3634      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3635      .addMemOperand(MMO);
3636 
3637   MI.eraseFromParent();
3638   return true;
3639 }
3640 
/// Pack the 16-bit address operands of \p MI into dword sized registers of
/// <2 x s16> elements, appending them to \p PackedAddrs.
3643 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3644                                         SmallVectorImpl<Register> &PackedAddrs,
3645                                         int AddrIdx, int DimIdx, int EndIdx,
3646                                         int NumGradients) {
3647   const LLT S16 = LLT::scalar(16);
3648   const LLT V2S16 = LLT::vector(2, 16);
3649 
3650   for (int I = AddrIdx; I < EndIdx; ++I) {
3651     MachineOperand &SrcOp = MI.getOperand(I);
3652     if (!SrcOp.isReg())
3653       continue; // _L to _LZ may have eliminated this.
3654 
3655     Register AddrReg = SrcOp.getReg();
3656 
3657     if (I < DimIdx) {
3658       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3659       PackedAddrs.push_back(AddrReg);
3660     } else {
3661       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3662       // derivatives dx/dh and dx/dv are packed with undef.
3663       if (((I + 1) >= EndIdx) ||
3664           ((NumGradients / 2) % 2 == 1 &&
3665            (I == DimIdx + (NumGradients / 2) - 1 ||
3666             I == DimIdx + NumGradients - 1)) ||
3667           // Check for _L to _LZ optimization
3668           !MI.getOperand(I + 1).isReg()) {
3669         PackedAddrs.push_back(
3670             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3671                 .getReg(0));
3672       } else {
3673         PackedAddrs.push_back(
3674             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3675                 .getReg(0));
3676         ++I;
3677       }
3678     }
3679   }
3680 }
3681 
3682 /// Convert from separate vaddr components to a single vector address register,
3683 /// and replace the remaining operands with $noreg.
3684 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3685                                      int DimIdx, int NumVAddrs) {
3686   const LLT S32 = LLT::scalar(32);
3687 
3688   SmallVector<Register, 8> AddrRegs;
3689   for (int I = 0; I != NumVAddrs; ++I) {
3690     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3691     if (SrcOp.isReg()) {
3692       AddrRegs.push_back(SrcOp.getReg());
3693       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3694     }
3695   }
3696 
3697   int NumAddrRegs = AddrRegs.size();
3698   if (NumAddrRegs != 1) {
3699     // Round up to 8 elements for v5-v7
3700     // FIXME: Missing intermediate sized register classes and instructions.
3701     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3702       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3703       auto Undef = B.buildUndef(S32);
3704       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3705       NumAddrRegs = RoundedNumRegs;
3706     }
3707 
3708     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3709     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3710   }
3711 
3712   for (int I = 1; I != NumVAddrs; ++I) {
3713     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3714     if (SrcOp.isReg())
3715       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3716   }
3717 }
3718 
3719 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3720 ///
3721 /// Depending on the subtarget, load/store with 16-bit element data need to be
3722 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3723 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3724 /// registers.
3725 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
3732 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3733     MachineInstr &MI, MachineIRBuilder &B,
3734     GISelChangeObserver &Observer,
3735     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3736 
3737   const int NumDefs = MI.getNumExplicitDefs();
3738   bool IsTFE = NumDefs == 2;
3739   // We are only processing the operands of d16 image operations on subtargets
3740   // that use the unpacked register layout, or need to repack the TFE result.
3741 
3742   // TODO: Do we need to guard against already legalized intrinsics?
3743   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3744     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3745 
3746   MachineRegisterInfo *MRI = B.getMRI();
3747   const LLT S32 = LLT::scalar(32);
3748   const LLT S16 = LLT::scalar(16);
3749   const LLT V2S16 = LLT::vector(2, 16);
3750 
3751   // Index of first address argument
3752   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3753 
3754   int NumVAddrs, NumGradients;
3755   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3756   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3757     getDMaskIdx(BaseOpcode, NumDefs);
3758   unsigned DMask = 0;
3759 
  // Check whether the gradients and addresses are 16 bits; if so, they are
  // packed into dwords below.
3761   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3762   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3763   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3764   const bool IsG16 = GradTy == S16;
3765   const bool IsA16 = AddrTy == S16;
3766 
3767   int DMaskLanes = 0;
3768   if (!BaseOpcode->Atomic) {
3769     DMask = MI.getOperand(DMaskIdx).getImm();
3770     if (BaseOpcode->Gather4) {
3771       DMaskLanes = 4;
3772     } else if (DMask != 0) {
3773       DMaskLanes = countPopulation(DMask);
3774     } else if (!IsTFE && !BaseOpcode->Store) {
3775       // If dmask is 0, this is a no-op load. This can be eliminated.
3776       B.buildUndef(MI.getOperand(0));
3777       MI.eraseFromParent();
3778       return true;
3779     }
3780   }
3781 
3782   Observer.changingInstr(MI);
3783   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3784 
3785   unsigned NewOpcode = NumDefs == 0 ?
3786     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3787 
3788   // Track that we legalized this
3789   MI.setDesc(B.getTII().get(NewOpcode));
3790 
  // TFE is enabled, so an error flag result is still expected even though the
  // dmask is 0. Force the dmask to be at least 1, otherwise the instruction
  // will fail.
3793   if (IsTFE && DMask == 0) {
3794     DMask = 0x1;
3795     DMaskLanes = 1;
3796     MI.getOperand(DMaskIdx).setImm(DMask);
3797   }
3798 
3799   if (BaseOpcode->Atomic) {
3800     Register VData0 = MI.getOperand(2).getReg();
3801     LLT Ty = MRI->getType(VData0);
3802 
3803     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3804     if (Ty.isVector())
3805       return false;
3806 
3807     if (BaseOpcode->AtomicX2) {
3808       Register VData1 = MI.getOperand(3).getReg();
3809       // The two values are packed in one register.
3810       LLT PackedTy = LLT::vector(2, Ty);
3811       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3812       MI.getOperand(2).setReg(Concat.getReg(0));
3813       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3814     }
3815   }
3816 
3817   int CorrectedNumVAddrs = NumVAddrs;
3818 
  // Optimize _L to _LZ when the LOD argument is a constant zero (or negative).
3820   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3821         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3822     const ConstantFP *ConstantLod;
3823     const int LodIdx = AddrIdx + NumVAddrs - 1;
3824 
3825     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3826       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3827         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3828         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3829           LZMappingInfo->LZ, ImageDimIntr->Dim);
3830 
3831         // The starting indexes should remain in the same place.
3832         --NumVAddrs;
3833         --CorrectedNumVAddrs;
3834 
3835         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3836           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3837         MI.RemoveOperand(LodIdx);
3838       }
3839     }
3840   }
3841 
  // Optimize _mip away when 'lod' is a constant zero.
3843   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3844     int64_t ConstantLod;
3845     const int LodIdx = AddrIdx + NumVAddrs - 1;
3846 
3847     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3848       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3851         MI.getOperand(LodIdx).ChangeToImmediate(0);
3852         --CorrectedNumVAddrs;
3853       }
3854     }
3855   }
3856 
3857   // Rewrite the addressing register layout before doing anything else.
3858   if (IsA16 || IsG16) {
3859     if (IsA16) {
3860       // Target must support the feature and gradients need to be 16 bit too
3861       if (!ST.hasA16() || !IsG16)
3862         return false;
3863     } else if (!ST.hasG16())
3864       return false;
3865 
3866     if (NumVAddrs > 1) {
3867       SmallVector<Register, 4> PackedRegs;
3868       // Don't compress addresses for G16
3869       const int PackEndIdx =
3870           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3871       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3872                                   PackEndIdx, NumGradients);
3873 
3874       if (!IsA16) {
3875         // Add uncompressed address
3876         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3877           int AddrReg = MI.getOperand(I).getReg();
3878           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3879           PackedRegs.push_back(AddrReg);
3880         }
3881       }
3882 
3883       // See also below in the non-a16 branch
3884       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3885 
3886       if (!UseNSA && PackedRegs.size() > 1) {
3887         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3888         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3889         PackedRegs[0] = Concat.getReg(0);
3890         PackedRegs.resize(1);
3891       }
3892 
3893       const int NumPacked = PackedRegs.size();
3894       for (int I = 0; I != NumVAddrs; ++I) {
3895         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3896         if (!SrcOp.isReg()) {
3897           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3898           continue;
3899         }
3900 
3901         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3902 
3903         if (I < NumPacked)
3904           SrcOp.setReg(PackedRegs[I]);
3905         else
3906           SrcOp.setReg(AMDGPU::NoRegister);
3907       }
3908     }
3909   } else {
3910     // If the register allocator cannot place the address registers contiguously
3911     // without introducing moves, then using the non-sequential address encoding
3912     // is always preferable, since it saves VALU instructions and is usually a
3913     // wash in terms of code size or even better.
3914     //
3915     // However, we currently have no way of hinting to the register allocator
3916     // that MIMG addresses should be placed contiguously when it is possible to
3917     // do so, so force non-NSA for the common 2-address case as a heuristic.
3918     //
3919     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3920     // allocation when possible.
3921     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3922 
3923     if (!UseNSA && NumVAddrs > 1)
3924       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3925   }
3926 
3927   int Flags = 0;
3928   if (IsA16)
3929     Flags |= 1;
3930   if (IsG16)
3931     Flags |= 2;
3932   MI.addOperand(MachineOperand::CreateImm(Flags));
3933 
3934   if (BaseOpcode->Store) { // No TFE for stores?
3935     // TODO: Handle dmask trim
3936     Register VData = MI.getOperand(1).getReg();
3937     LLT Ty = MRI->getType(VData);
3938     if (!Ty.isVector() || Ty.getElementType() != S16)
3939       return true;
3940 
3941     Register RepackedReg = handleD16VData(B, *MRI, VData);
3942     if (RepackedReg != VData) {
3943       MI.getOperand(1).setReg(RepackedReg);
3944     }
3945 
3946     return true;
3947   }
3948 
3949   Register DstReg = MI.getOperand(0).getReg();
3950   LLT Ty = MRI->getType(DstReg);
3951   const LLT EltTy = Ty.getScalarType();
3952   const bool IsD16 = Ty.getScalarType() == S16;
3953   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3954 
3955   // Confirm that the return type is large enough for the dmask specified
3956   if (NumElts < DMaskLanes)
3957     return false;
3958 
3959   if (NumElts > 4 || DMaskLanes > 4)
3960     return false;
3961 
3962   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3963   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3964 
3965   // The raw dword aligned data component of the load. The only legal cases
3966   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3968   LLT RoundedTy;
3969 
  // S32 vector to cover all data, plus the TFE result element.
3971   LLT TFETy;
3972 
3973   // Register type to use for each loaded component. Will be S32 or V2S16.
3974   LLT RegTy;
3975 
3976   if (IsD16 && ST.hasUnpackedD16VMem()) {
3977     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3978     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3979     RegTy = S32;
3980   } else {
3981     unsigned EltSize = EltTy.getSizeInBits();
3982     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3983     unsigned RoundedSize = 32 * RoundedElts;
3984     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3985     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3986     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3987   }
3988 
3989   // The return type does not need adjustment.
3990   // TODO: Should we change s16 case to s32 or <2 x s16>?
3991   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3992     return true;
3993 
3994   Register Dst1Reg;
3995 
3996   // Insert after the instruction.
3997   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3998 
3999   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4000   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4001   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4002   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4003 
4004   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4005 
4006   MI.getOperand(0).setReg(NewResultReg);
4007 
4008   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
4010   // register, with one additional dword beyond the loaded data. Rewrite the
4011   // return type to use a single register result.
4012 
4013   if (IsTFE) {
4014     Dst1Reg = MI.getOperand(1).getReg();
4015     if (MRI->getType(Dst1Reg) != S32)
4016       return false;
4017 
4018     // TODO: Make sure the TFE operand bit is set.
4019     MI.RemoveOperand(1);
4020 
4021     // Handle the easy case that requires no repack instructions.
4022     if (Ty == S32) {
4023       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4024       return true;
4025     }
4026   }
4027 
4028   // Now figure out how to copy the new result register back into the old
4029   // result.
4030   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4031 
4032   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4033 
4034   if (ResultNumRegs == 1) {
4035     assert(!IsTFE);
4036     ResultRegs[0] = NewResultReg;
4037   } else {
4038     // We have to repack into a new vector of some kind.
4039     for (int I = 0; I != NumDataRegs; ++I)
4040       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4041     B.buildUnmerge(ResultRegs, NewResultReg);
4042 
4043     // Drop the final TFE element to get the data part. The TFE result is
4044     // directly written to the right place already.
4045     if (IsTFE)
4046       ResultRegs.resize(NumDataRegs);
4047   }
4048 
4049   // For an s16 scalar result, we form an s32 result with a truncate regardless
4050   // of packed vs. unpacked.
4051   if (IsD16 && !Ty.isVector()) {
4052     B.buildTrunc(DstReg, ResultRegs[0]);
4053     return true;
4054   }
4055 
4056   // Avoid a build/concat_vector of 1 entry.
4057   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4058     B.buildBitcast(DstReg, ResultRegs[0]);
4059     return true;
4060   }
4061 
4062   assert(Ty.isVector());
4063 
4064   if (IsD16) {
4065     // For packed D16 results with TFE enabled, all the data components are
4066     // S32. Cast back to the expected type.
4067     //
    // TODO: We don't really need to load s32 elements. We would only need one
4069     // cast for the TFE result if a multiple of v2s16 was used.
4070     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4071       for (Register &Reg : ResultRegs)
4072         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4073     } else if (ST.hasUnpackedD16VMem()) {
4074       for (Register &Reg : ResultRegs)
4075         Reg = B.buildTrunc(S16, Reg).getReg(0);
4076     }
4077   }
4078 
4079   auto padWithUndef = [&](LLT Ty, int NumElts) {
4080     if (NumElts == 0)
4081       return;
4082     Register Undef = B.buildUndef(Ty).getReg(0);
4083     for (int I = 0; I != NumElts; ++I)
4084       ResultRegs.push_back(Undef);
4085   };
4086 
4087   // Pad out any elements eliminated due to the dmask.
4088   LLT ResTy = MRI->getType(ResultRegs[0]);
4089   if (!ResTy.isVector()) {
4090     padWithUndef(ResTy, NumElts - ResultRegs.size());
4091     B.buildBuildVector(DstReg, ResultRegs);
4092     return true;
4093   }
4094 
4095   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4096   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4097 
4098   // Deal with the one annoying legal case.
4099   const LLT V3S16 = LLT::vector(3, 16);
4100   if (Ty == V3S16) {
4101     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4102     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4103     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4104     return true;
4105   }
4106 
4107   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4108   B.buildConcatVectors(DstReg, ResultRegs);
4109   return true;
4110 }
4111 
4112 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4113   MachineInstr &MI, MachineIRBuilder &B,
4114   GISelChangeObserver &Observer) const {
4115   Register Dst = MI.getOperand(0).getReg();
4116   LLT Ty = B.getMRI()->getType(Dst);
4117   unsigned Size = Ty.getSizeInBits();
4118   MachineFunction &MF = B.getMF();
4119 
4120   Observer.changingInstr(MI);
4121 
4122   // FIXME: We don't really need this intermediate instruction. The intrinsic
4123   // should be fixed to have a memory operand. Since it's readnone, we're not
4124   // allowed to add one.
4125   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4126   MI.RemoveOperand(1); // Remove intrinsic ID
4127 
4128   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4129   // TODO: Should this use datalayout alignment?
4130   const unsigned MemSize = (Size + 7) / 8;
4131   const Align MemAlign(4);
4132   MachineMemOperand *MMO = MF.getMachineMemOperand(
4133       MachinePointerInfo(),
4134       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4135           MachineMemOperand::MOInvariant,
4136       MemSize, MemAlign);
4137   MI.addMemOperand(MF, MMO);
4138 
4139   // There are no 96-bit result scalar loads, but widening to 128-bit should
4140   // always be legal. We may need to restore this to a 96-bit result if it turns
4141   // out this needs to be converted to a vector load during RegBankSelect.
4142   if (!isPowerOf2_32(Size)) {
4143     LegalizerHelper Helper(MF, *this, Observer, B);
4144 
4145     if (Ty.isVector())
4146       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4147     else
4148       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4149   }
4150 
4151   Observer.changedInstr(MI);
4152   return true;
4153 }
4154 
4155 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4156                                                 MachineRegisterInfo &MRI,
4157                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, lower to a
  // plain s_endpgm.
4159   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4160       !ST.isTrapHandlerEnabled()) {
4161     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4162   } else {
4163     // Pass queue pointer to trap handler as input, and insert trap instruction
4164     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4165     const ArgDescriptor *Arg =
4166         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4167     if (!Arg)
4168       return false;
4169     MachineRegisterInfo &MRI = *B.getMRI();
4170     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4171     Register LiveIn = getLiveInRegister(
4172         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4173         /*InsertLiveInCopy=*/false);
4174     if (!loadInputValue(LiveIn, B, Arg))
4175       return false;
4176     B.buildCopy(SGPR01, LiveIn);
4177     B.buildInstr(AMDGPU::S_TRAP)
4178         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4179         .addReg(SGPR01, RegState::Implicit);
4180   }
4181 
4182   MI.eraseFromParent();
4183   return true;
4184 }
4185 
4186 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4187     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA target or the trap handler is disabled, report a
  // warning instead of emitting a trap.
4190   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4191       !ST.isTrapHandlerEnabled()) {
4192     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4193                                      "debugtrap handler not supported",
4194                                      MI.getDebugLoc(), DS_Warning);
4195     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4196     Ctx.diagnose(NoTrap);
4197   } else {
4198     // Insert debug-trap instruction
4199     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4200   }
4201 
4202   MI.eraseFromParent();
4203   return true;
4204 }
4205 
4206 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4207                                             MachineInstr &MI) const {
4208   MachineIRBuilder &B = Helper.MIRBuilder;
4209   MachineRegisterInfo &MRI = *B.getMRI();
4210 
  // Replace the G_BRCOND uses of these intrinsics with the exec-manipulating
  // branch pseudos.
4212   auto IntrID = MI.getIntrinsicID();
4213   switch (IntrID) {
4214   case Intrinsic::amdgcn_if:
4215   case Intrinsic::amdgcn_else: {
4216     MachineInstr *Br = nullptr;
4217     MachineBasicBlock *UncondBrTarget = nullptr;
4218     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4219       const SIRegisterInfo *TRI
4220         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4221 
4222       Register Def = MI.getOperand(1).getReg();
4223       Register Use = MI.getOperand(3).getReg();
4224 
4225       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4226       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4227       if (IntrID == Intrinsic::amdgcn_if) {
4228         B.buildInstr(AMDGPU::SI_IF)
4229           .addDef(Def)
4230           .addUse(Use)
4231           .addMBB(UncondBrTarget);
4232       } else {
4233         B.buildInstr(AMDGPU::SI_ELSE)
4234           .addDef(Def)
4235           .addUse(Use)
4236           .addMBB(UncondBrTarget)
4237           .addImm(0);
4238       }
4239 
4240       if (Br) {
4241         Br->getOperand(0).setMBB(CondBrTarget);
4242       } else {
4243         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4244         // since we're swapping branch targets it needs to be reinserted.
4245         // FIXME: IRTranslator should probably not do this
4246         B.buildBr(*CondBrTarget);
4247       }
4248 
4249       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4250       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4251       MI.eraseFromParent();
4252       BrCond->eraseFromParent();
4253       return true;
4254     }
4255 
4256     return false;
4257   }
4258   case Intrinsic::amdgcn_loop: {
4259     MachineInstr *Br = nullptr;
4260     MachineBasicBlock *UncondBrTarget = nullptr;
4261     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4262       const SIRegisterInfo *TRI
4263         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4264 
4265       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4266       Register Reg = MI.getOperand(2).getReg();
4267 
4268       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4269       B.buildInstr(AMDGPU::SI_LOOP)
4270         .addUse(Reg)
4271         .addMBB(UncondBrTarget);
4272 
4273       if (Br)
4274         Br->getOperand(0).setMBB(CondBrTarget);
4275       else
4276         B.buildBr(*CondBrTarget);
4277 
4278       MI.eraseFromParent();
4279       BrCond->eraseFromParent();
4280       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4281       return true;
4282     }
4283 
4284     return false;
4285   }
4286   case Intrinsic::amdgcn_kernarg_segment_ptr:
4287     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4288       // This only makes sense to call in a kernel, so just lower to null.
4289       B.buildConstant(MI.getOperand(0).getReg(), 0);
4290       MI.eraseFromParent();
4291       return true;
4292     }
4293 
4294     return legalizePreloadedArgIntrin(
4295       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4296   case Intrinsic::amdgcn_implicitarg_ptr:
4297     return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Helper.Observer);
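  // The raw and struct buffer/tbuffer accesses share the load and store
  // legalization helpers; the boolean flags distinguish the plain, format,
  // and typed (tbuffer) variants.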
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
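  // All raw/struct buffer atomics funnel into a single helper that dispatches
  // on the intrinsic ID.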
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
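  // Image intrinsics are identified through the dim-intrinsic table; any other
  // intrinsic with no special handling here is already legal as-is.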
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}