1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of elements up to the next power of two.
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the number of bits up to the next power of two.
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
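// Match vectors of sub-32-bit elements with an odd element count whose total
// size is not a multiple of 32 bits.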
53 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
54   return [=](const LegalityQuery &Query) {
55     const LLT Ty = Query.Types[TypeIdx];
56     return Ty.isVector() &&
57            Ty.getNumElements() % 2 != 0 &&
58            Ty.getElementType().getSizeInBits() < 32 &&
59            Ty.getSizeInBits() % 32 != 0;
60   };
61 }
62 
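// Match vectors of 16-bit elements with more than two elements.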
63 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     const LLT EltTy = Ty.getScalarType();
67     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
68   };
69 }
70 
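// Mutation that increases the element count of the vector at TypeIdx by one.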
71 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
72   return [=](const LegalityQuery &Query) {
73     const LLT Ty = Query.Types[TypeIdx];
74     const LLT EltTy = Ty.getElementType();
75     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
76   };
77 }
78 
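// Mutation that reduces the element count of the vector at TypeIdx so it is
// broken into roughly 64-bit pieces.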
79 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     const LLT EltTy = Ty.getElementType();
83     unsigned Size = Ty.getSizeInBits();
84     unsigned Pieces = (Size + 63) / 64;
85     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
86     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
87   };
88 }
89 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
92 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95 
96     const LLT EltTy = Ty.getElementType();
97     const int Size = Ty.getSizeInBits();
98     const int EltSize = EltTy.getSizeInBits();
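    // Number of 32-bit pieces needed to hold Size bits.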
99     const int NextMul32 = (Size + 31) / 32;
100 
101     assert(EltSize < 32);
102 
103     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
104     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
105   };
106 }
107 
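// Mutation that bitcasts the type at TypeIdx to an equivalently sized
// register type: s16 for sub-32-bit types, otherwise a scalar or vector of
// 32-bit elements.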
108 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
109   return [=](const LegalityQuery &Query) {
110     const LLT Ty = Query.Types[TypeIdx];
111     unsigned Size = Ty.getSizeInBits();
112 
113     LLT CoercedTy;
114     if (Size < 32) {
115       // <2 x s8> -> s16
116       assert(Size == 16);
117       CoercedTy = LLT::scalar(16);
118     } else
119       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
120 
121     return std::make_pair(TypeIdx, CoercedTy);
122   };
123 }
124 
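// Match vectors whose total size in bits is less than Size.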
125 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
126   return [=](const LegalityQuery &Query) {
127     const LLT QueryTy = Query.Types[TypeIdx];
128     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
129   };
130 }
131 
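// Match vectors whose total size in bits is greater than Size.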
132 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
133   return [=](const LegalityQuery &Query) {
134     const LLT QueryTy = Query.Types[TypeIdx];
135     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
136   };
137 }
138 
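// Match vectors with an odd number of elements.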
139 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
140   return [=](const LegalityQuery &Query) {
141     const LLT QueryTy = Query.Types[TypeIdx];
142     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
143   };
144 }
145 
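// Return true for sizes that fit in a whole number of 32-bit registers, up
// to the 1024-bit maximum.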
146 static bool isRegisterSize(unsigned Size) {
147   return Size % 32 == 0 && Size <= 1024;
148 }
149 
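// Return true for element types that map directly onto registers: 16-bit
// elements or elements that are a multiple of 32 bits.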
150 static bool isRegisterVectorElementType(LLT EltTy) {
151   const int EltSize = EltTy.getSizeInBits();
152   return EltSize == 16 || EltSize % 32 == 0;
153 }
154 
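// Return true for vector types that can be mapped onto 32-bit registers:
// 32, 64, 128 or 256-bit elements, or an even number of 16-bit elements.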
155 static bool isRegisterVectorType(LLT Ty) {
156   const int EltSize = Ty.getElementType().getSizeInBits();
157   return EltSize == 32 || EltSize == 64 ||
158          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159          EltSize == 128 || EltSize == 256;
160 }
161 
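// Return true if Ty has a register-compatible total size and, for vectors, a
// register-compatible element layout.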
162 static bool isRegisterType(LLT Ty) {
163   if (!isRegisterSize(Ty.getSizeInBits()))
164     return false;
165 
166   if (Ty.isVector())
167     return isRegisterVectorType(Ty);
168 
169   return true;
170 }
171 
172 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
173 // v2s16.
174 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     return isRegisterType(Query.Types[TypeIdx]);
177   };
178 }
179 
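// Match vectors whose element type is s16 or at least 32 bits wide.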
180 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
181   return [=](const LegalityQuery &Query) {
182     const LLT QueryTy = Query.Types[TypeIdx];
183     if (!QueryTy.isVector())
184       return false;
185     const LLT EltTy = QueryTy.getElementType();
186     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
187   };
188 }
189 
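// Match truncating stores of a scalar value wider than 32 bits.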
190 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT Ty = Query.Types[TypeIdx];
193     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
194            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
195   };
196 }
197 
198 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
199 // handle some operations by just promoting the register during
200 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
201 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
202                                     bool IsLoad) {
203   switch (AS) {
204   case AMDGPUAS::PRIVATE_ADDRESS:
205     // FIXME: Private element size.
206     return 32;
207   case AMDGPUAS::LOCAL_ADDRESS:
208     return ST.useDS128() ? 128 : 64;
209   case AMDGPUAS::GLOBAL_ADDRESS:
210   case AMDGPUAS::CONSTANT_ADDRESS:
211   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
218     return IsLoad ? 512 : 128;
219   default:
220     // Flat addresses may contextually need to be split to 32-bit parts if they
221     // may alias scratch depending on the subtarget.
222     return 128;
223   }
224 }
225 
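// Return true if a load or store with this register type, memory size,
// alignment and address space can be selected directly, without splitting or
// widening.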
226 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
227                                  const LegalityQuery &Query,
228                                  unsigned Opcode) {
229   const LLT Ty = Query.Types[0];
230 
231   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
232   const bool IsLoad = Opcode != AMDGPU::G_STORE;
233 
234   unsigned RegSize = Ty.getSizeInBits();
235   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
236   unsigned Align = Query.MMODescrs[0].AlignInBits;
237   unsigned AS = Query.Types[1].getAddressSpace();
238 
239   // All of these need to be custom lowered to cast the pointer operand.
240   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
241     return false;
242 
  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
245 #if 0
246   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
248     MemSize = std::max(MemSize, Align);
249 #endif
250 
251   // Only 1-byte and 2-byte to 32-bit extloads are valid.
252   if (MemSize != RegSize && RegSize != 32)
253     return false;
254 
255   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
256     return false;
257 
258   switch (MemSize) {
259   case 8:
260   case 16:
261   case 32:
262   case 64:
263   case 128:
264     break;
265   case 96:
266     if (!ST.hasDwordx3LoadStores())
267       return false;
268     break;
269   case 256:
270   case 512:
271     // These may contextually need to be broken down.
272     break;
273   default:
274     return false;
275   }
276 
277   assert(RegSize >= MemSize);
278 
279   if (Align < MemSize) {
280     const SITargetLowering *TLI = ST.getTargetLowering();
281     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
282       return false;
283   }
284 
285   return true;
286 }
287 
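// A load or store is directly legal when the result is a register type and
// the access size is legal for the address space.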
288 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
289                              unsigned Opcode) {
290   const LLT Ty = Query.Types[0];
291   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode);
292 }
293 
294 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
295                                          const GCNTargetMachine &TM)
  : ST(ST_) {
297   using namespace TargetOpcode;
298 
299   auto GetAddrSpacePtr = [&TM](unsigned AS) {
300     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
301   };
302 
303   const LLT S1 = LLT::scalar(1);
304   const LLT S16 = LLT::scalar(16);
305   const LLT S32 = LLT::scalar(32);
306   const LLT S64 = LLT::scalar(64);
307   const LLT S128 = LLT::scalar(128);
308   const LLT S256 = LLT::scalar(256);
309   const LLT S512 = LLT::scalar(512);
310   const LLT S1024 = LLT::scalar(1024);
311 
312   const LLT V2S16 = LLT::vector(2, 16);
313   const LLT V4S16 = LLT::vector(4, 16);
314 
315   const LLT V2S32 = LLT::vector(2, 32);
316   const LLT V3S32 = LLT::vector(3, 32);
317   const LLT V4S32 = LLT::vector(4, 32);
318   const LLT V5S32 = LLT::vector(5, 32);
319   const LLT V6S32 = LLT::vector(6, 32);
320   const LLT V7S32 = LLT::vector(7, 32);
321   const LLT V8S32 = LLT::vector(8, 32);
322   const LLT V9S32 = LLT::vector(9, 32);
323   const LLT V10S32 = LLT::vector(10, 32);
324   const LLT V11S32 = LLT::vector(11, 32);
325   const LLT V12S32 = LLT::vector(12, 32);
326   const LLT V13S32 = LLT::vector(13, 32);
327   const LLT V14S32 = LLT::vector(14, 32);
328   const LLT V15S32 = LLT::vector(15, 32);
329   const LLT V16S32 = LLT::vector(16, 32);
330   const LLT V32S32 = LLT::vector(32, 32);
331 
332   const LLT V2S64 = LLT::vector(2, 64);
333   const LLT V3S64 = LLT::vector(3, 64);
334   const LLT V4S64 = LLT::vector(4, 64);
335   const LLT V5S64 = LLT::vector(5, 64);
336   const LLT V6S64 = LLT::vector(6, 64);
337   const LLT V7S64 = LLT::vector(7, 64);
338   const LLT V8S64 = LLT::vector(8, 64);
339   const LLT V16S64 = LLT::vector(16, 64);
340 
341   std::initializer_list<LLT> AllS32Vectors =
342     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
344   std::initializer_list<LLT> AllS64Vectors =
345     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
346 
347   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
348   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
349   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
350   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
351   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
352   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
353   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
354 
355   const LLT CodePtr = FlatPtr;
356 
357   const std::initializer_list<LLT> AddrSpaces64 = {
358     GlobalPtr, ConstantPtr, FlatPtr
359   };
360 
361   const std::initializer_list<LLT> AddrSpaces32 = {
362     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
363   };
364 
365   const std::initializer_list<LLT> FPTypesBase = {
366     S32, S64
367   };
368 
369   const std::initializer_list<LLT> FPTypes16 = {
370     S32, S64, S16
371   };
372 
373   const std::initializer_list<LLT> FPTypesPK16 = {
374     S32, S64, S16, V2S16
375   };
376 
377   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
378 
379   setAction({G_BRCOND, S1}, Legal); // VCC branches
380   setAction({G_BRCOND, S32}, Legal); // SCC branches
381 
382   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
383   // elements for v3s16
384   getActionDefinitionsBuilder(G_PHI)
385     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
386     .legalFor(AllS32Vectors)
387     .legalFor(AllS64Vectors)
388     .legalFor(AddrSpaces64)
389     .legalFor(AddrSpaces32)
390     .clampScalar(0, S32, S256)
391     .widenScalarToNextPow2(0, 32)
392     .clampMaxNumElements(0, S32, 16)
393     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
394     .legalIf(isPointer(0));
395 
396   if (ST.hasVOP3PInsts()) {
397     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
398       .legalFor({S32, S16, V2S16})
399       .clampScalar(0, S16, S32)
400       .clampMaxNumElements(0, S16, 2)
401       .scalarize(0)
402       .widenScalarToNextPow2(0, 32);
403   } else if (ST.has16BitInsts()) {
404     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
405       .legalFor({S32, S16})
406       .clampScalar(0, S16, S32)
407       .scalarize(0)
408       .widenScalarToNextPow2(0, 32);
409   } else {
410     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
411       .legalFor({S32})
412       .clampScalar(0, S32, S32)
413       .scalarize(0);
414   }
415 
416   // FIXME: Not really legal. Placeholder for custom lowering.
417   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
418     .customFor({S32, S64})
419     .clampScalar(0, S32, S64)
420     .widenScalarToNextPow2(0, 32)
421     .scalarize(0);
422 
423   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
424     .legalFor({S32})
425     .clampScalar(0, S32, S32)
426     .scalarize(0);
427 
  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
430   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
431     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
432     .clampScalar(0, S32, S64)
433     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
434     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
435     .widenScalarToNextPow2(0)
436     .scalarize(0);
437 
438   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
439                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
440     .legalFor({{S32, S1}, {S32, S32}})
441     .minScalar(0, S32)
442     // TODO: .scalarize(0)
443     .lower();
444 
445   getActionDefinitionsBuilder(G_BITCAST)
446     // Don't worry about the size constraint.
447     .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

451   getActionDefinitionsBuilder(G_CONSTANT)
452     .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
454     .clampScalar(0, S32, S64)
455     .widenScalarToNextPow2(0)
456     .legalIf(isPointer(0));
457 
458   getActionDefinitionsBuilder(G_FCONSTANT)
459     .legalFor({S32, S64, S16})
460     .clampScalar(0, S16, S64);
461 
462   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
463       .legalIf(isRegisterType(0))
464       // s1 and s16 are special cases because they have legal operations on
465       // them, but don't really occupy registers in the normal way.
466       .legalFor({S1, S16})
467       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
468       .clampScalarOrElt(0, S32, S1024)
469       .widenScalarToNextPow2(0, 32)
470       .clampMaxNumElements(0, S32, 16);
471 
472   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
473 
474   // If the amount is divergent, we have to do a wave reduction to get the
475   // maximum value, so this is expanded during RegBankSelect.
476   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
477     .legalFor({{PrivatePtr, S32}});
478 
479   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
480     .unsupportedFor({PrivatePtr})
481     .custom();
482   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
483 
484   auto &FPOpActions = getActionDefinitionsBuilder(
485     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
486     .legalFor({S32, S64});
487   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
488     .customFor({S32, S64});
489   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
490     .customFor({S32, S64});
491 
492   if (ST.has16BitInsts()) {
493     if (ST.hasVOP3PInsts())
494       FPOpActions.legalFor({S16, V2S16});
495     else
496       FPOpActions.legalFor({S16});
497 
498     TrigActions.customFor({S16});
499     FDIVActions.customFor({S16});
500   }
501 
502   auto &MinNumMaxNum = getActionDefinitionsBuilder({
503       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
504 
505   if (ST.hasVOP3PInsts()) {
506     MinNumMaxNum.customFor(FPTypesPK16)
507       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
508       .clampMaxNumElements(0, S16, 2)
509       .clampScalar(0, S16, S64)
510       .scalarize(0);
511   } else if (ST.has16BitInsts()) {
512     MinNumMaxNum.customFor(FPTypes16)
513       .clampScalar(0, S16, S64)
514       .scalarize(0);
515   } else {
516     MinNumMaxNum.customFor(FPTypesBase)
517       .clampScalar(0, S32, S64)
518       .scalarize(0);
519   }
520 
521   if (ST.hasVOP3PInsts())
522     FPOpActions.clampMaxNumElements(0, S16, 2);
523 
524   FPOpActions
525     .scalarize(0)
526     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
527 
528   TrigActions
529     .scalarize(0)
530     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
531 
532   FDIVActions
533     .scalarize(0)
534     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
535 
536   getActionDefinitionsBuilder({G_FNEG, G_FABS})
537     .legalFor(FPTypesPK16)
538     .clampMaxNumElements(0, S16, 2)
539     .scalarize(0)
540     .clampScalar(0, S16, S64);
541 
542   if (ST.has16BitInsts()) {
543     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
544       .legalFor({S32, S64, S16})
545       .scalarize(0)
546       .clampScalar(0, S16, S64);
547   } else {
548     getActionDefinitionsBuilder(G_FSQRT)
549       .legalFor({S32, S64})
550       .scalarize(0)
551       .clampScalar(0, S32, S64);
552 
553     if (ST.hasFractBug()) {
554       getActionDefinitionsBuilder(G_FFLOOR)
555         .customFor({S64})
556         .legalFor({S32, S64})
557         .scalarize(0)
558         .clampScalar(0, S32, S64);
559     } else {
560       getActionDefinitionsBuilder(G_FFLOOR)
561         .legalFor({S32, S64})
562         .scalarize(0)
563         .clampScalar(0, S32, S64);
564     }
565   }
566 
567   getActionDefinitionsBuilder(G_FPTRUNC)
568     .legalFor({{S32, S64}, {S16, S32}})
569     .scalarize(0)
570     .lower();
571 
572   getActionDefinitionsBuilder(G_FPEXT)
573     .legalFor({{S64, S32}, {S32, S16}})
574     .lowerFor({{S64, S16}}) // FIXME: Implement
575     .scalarize(0);
576 
577   getActionDefinitionsBuilder(G_FSUB)
578       // Use actual fsub instruction
579       .legalFor({S32})
580       // Must use fadd + fneg
581       .lowerFor({S64, S16, V2S16})
582       .scalarize(0)
583       .clampScalar(0, S32, S64);
584 
585   // Whether this is legal depends on the floating point mode for the function.
586   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
587   if (ST.hasMadF16())
588     FMad.customFor({S32, S16});
589   else
590     FMad.customFor({S32});
591   FMad.scalarize(0)
592       .lower();
593 
594   // TODO: Do we need to clamp maximum bitwidth?
595   getActionDefinitionsBuilder(G_TRUNC)
596     .legalIf(isScalar(0))
597     .legalFor({{V2S16, V2S32}})
598     .clampMaxNumElements(0, S16, 2)
599     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
600     // situations (like an invalid implicit use), we don't want to infinite loop
601     // in the legalizer.
602     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
603     .alwaysLegal();
604 
605   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
606     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
607                {S32, S1}, {S64, S1}, {S16, S1}})
608     .scalarize(0)
609     .clampScalar(0, S32, S64)
610     .widenScalarToNextPow2(1, 32);
611 
612   // TODO: Split s1->s64 during regbankselect for VALU.
613   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
614     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
615     .lowerFor({{S32, S64}})
616     .lowerIf(typeIs(1, S1))
617     .customFor({{S64, S64}});
618   if (ST.has16BitInsts())
619     IToFP.legalFor({{S16, S16}});
620   IToFP.clampScalar(1, S32, S64)
621        .scalarize(0)
622        .widenScalarToNextPow2(1);
623 
624   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
625     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
626     .customFor({{S64, S64}});
627   if (ST.has16BitInsts())
628     FPToI.legalFor({{S16, S16}});
629   else
630     FPToI.minScalar(1, S32);
631 
632   FPToI.minScalar(0, S32)
633        .scalarize(0)
634        .lower();
635 
636   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
637     .scalarize(0)
638     .lower();
639 
640   if (ST.has16BitInsts()) {
641     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
642       .legalFor({S16, S32, S64})
643       .clampScalar(0, S16, S64)
644       .scalarize(0);
645   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
646     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
647       .legalFor({S32, S64})
648       .clampScalar(0, S32, S64)
649       .scalarize(0);
650   } else {
651     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
652       .legalFor({S32})
653       .customFor({S64})
654       .clampScalar(0, S32, S64)
655       .scalarize(0);
656   }
657 
658   // FIXME: Clamp offset operand.
659   getActionDefinitionsBuilder(G_PTR_ADD)
660     .legalIf(isPointer(0))
661     .scalarize(0);
662 
663   getActionDefinitionsBuilder(G_PTRMASK)
664     .legalIf(typeInSet(1, {S64, S32}))
665     .minScalar(1, S32)
666     .maxScalarIf(sizeIs(0, 32), 1, S32)
667     .maxScalarIf(sizeIs(0, 64), 1, S64)
668     .scalarize(0);
669 
670   auto &CmpBuilder =
671     getActionDefinitionsBuilder(G_ICMP)
672     // The compare output type differs based on the register bank of the output,
673     // so make both s1 and s32 legal.
674     //
675     // Scalar compares producing output in scc will be promoted to s32, as that
676     // is the allocatable register type that will be needed for the copy from
677     // scc. This will be promoted during RegBankSelect, and we assume something
678     // before that won't try to use s32 result types.
679     //
680     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
681     // bank.
682     .legalForCartesianProduct(
683       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
684     .legalForCartesianProduct(
685       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
686   if (ST.has16BitInsts()) {
687     CmpBuilder.legalFor({{S1, S16}});
688   }
689 
690   CmpBuilder
691     .widenScalarToNextPow2(1)
692     .clampScalar(1, S32, S64)
693     .scalarize(0)
694     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
695 
696   getActionDefinitionsBuilder(G_FCMP)
697     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
698     .widenScalarToNextPow2(1)
699     .clampScalar(1, S32, S64)
700     .scalarize(0);
701 
702   // FIXME: fpow has a selection pattern that should move to custom lowering.
703   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
704   if (ST.has16BitInsts())
705     Exp2Ops.legalFor({S32, S16});
706   else
707     Exp2Ops.legalFor({S32});
708   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
709   Exp2Ops.scalarize(0);
710 
711   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
712   if (ST.has16BitInsts())
713     ExpOps.customFor({{S32}, {S16}});
714   else
715     ExpOps.customFor({S32});
716   ExpOps.clampScalar(0, MinScalarFPTy, S32)
717         .scalarize(0);
718 
719   // The 64-bit versions produce 32-bit results, but only on the SALU.
720   getActionDefinitionsBuilder(G_CTPOP)
721     .legalFor({{S32, S32}, {S32, S64}})
722     .clampScalar(0, S32, S32)
723     .clampScalar(1, S32, S64)
724     .scalarize(0)
725     .widenScalarToNextPow2(0, 32)
726     .widenScalarToNextPow2(1, 32);
727 
728   // The hardware instructions return a different result on 0 than the generic
729   // instructions expect. The hardware produces -1, but these produce the
730   // bitwidth.
731   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
732     .scalarize(0)
733     .clampScalar(0, S32, S32)
734     .clampScalar(1, S32, S64)
735     .widenScalarToNextPow2(0, 32)
736     .widenScalarToNextPow2(1, 32)
737     .lower();
738 
739   // The 64-bit versions produce 32-bit results, but only on the SALU.
740   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
741     .legalFor({{S32, S32}, {S32, S64}})
742     .clampScalar(0, S32, S32)
743     .clampScalar(1, S32, S64)
744     .scalarize(0)
745     .widenScalarToNextPow2(0, 32)
746     .widenScalarToNextPow2(1, 32);
747 
748   getActionDefinitionsBuilder(G_BITREVERSE)
749     .legalFor({S32})
750     .clampScalar(0, S32, S32)
751     .scalarize(0);
752 
753   if (ST.has16BitInsts()) {
754     getActionDefinitionsBuilder(G_BSWAP)
755       .legalFor({S16, S32, V2S16})
756       .clampMaxNumElements(0, S16, 2)
757       // FIXME: Fixing non-power-of-2 before clamp is workaround for
758       // narrowScalar limitation.
759       .widenScalarToNextPow2(0)
760       .clampScalar(0, S16, S32)
761       .scalarize(0);
762 
763     if (ST.hasVOP3PInsts()) {
764       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
765         .legalFor({S32, S16, V2S16})
766         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
767         .clampMaxNumElements(0, S16, 2)
768         .minScalar(0, S16)
769         .widenScalarToNextPow2(0)
770         .scalarize(0)
771         .lower();
772     } else {
773       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
774         .legalFor({S32, S16})
775         .widenScalarToNextPow2(0)
776         .minScalar(0, S16)
777         .scalarize(0)
778         .lower();
779     }
780   } else {
781     // TODO: Should have same legality without v_perm_b32
782     getActionDefinitionsBuilder(G_BSWAP)
783       .legalFor({S32})
784       .lowerIf(scalarNarrowerThan(0, 32))
785       // FIXME: Fixing non-power-of-2 before clamp is workaround for
786       // narrowScalar limitation.
787       .widenScalarToNextPow2(0)
788       .maxScalar(0, S32)
789       .scalarize(0)
790       .lower();
791 
792     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
793       .legalFor({S32})
794       .minScalar(0, S32)
795       .widenScalarToNextPow2(0)
796       .scalarize(0)
797       .lower();
798   }
799 
800   getActionDefinitionsBuilder(G_INTTOPTR)
801     // List the common cases
802     .legalForCartesianProduct(AddrSpaces64, {S64})
803     .legalForCartesianProduct(AddrSpaces32, {S32})
804     .scalarize(0)
805     // Accept any address space as long as the size matches
806     .legalIf(sameSize(0, 1))
807     .widenScalarIf(smallerThan(1, 0),
808       [](const LegalityQuery &Query) {
809         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
810       })
811     .narrowScalarIf(largerThan(1, 0),
812       [](const LegalityQuery &Query) {
813         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
814       });
815 
816   getActionDefinitionsBuilder(G_PTRTOINT)
817     // List the common cases
818     .legalForCartesianProduct(AddrSpaces64, {S64})
819     .legalForCartesianProduct(AddrSpaces32, {S32})
820     .scalarize(0)
821     // Accept any address space as long as the size matches
822     .legalIf(sameSize(0, 1))
823     .widenScalarIf(smallerThan(0, 1),
824       [](const LegalityQuery &Query) {
825         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
826       })
827     .narrowScalarIf(
828       largerThan(0, 1),
829       [](const LegalityQuery &Query) {
830         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
831       });
832 
833   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
834     .scalarize(0)
835     .custom();
836 
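  // Return true if this memory access has to be split into smaller accesses,
  // either because it exceeds the maximum size for the address space, has an
  // unsupported dword count, or is insufficiently aligned.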
837   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
838                                     bool IsLoad) -> bool {
839     const LLT DstTy = Query.Types[0];
840 
841     // Split vector extloads.
842     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
843     unsigned Align = Query.MMODescrs[0].AlignInBits;
844 
845     if (MemSize < DstTy.getSizeInBits())
846       MemSize = std::max(MemSize, Align);
847 
848     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
849       return true;
850 
851     const LLT PtrTy = Query.Types[1];
852     unsigned AS = PtrTy.getAddressSpace();
853     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
854       return true;
855 
856     // Catch weird sized loads that don't evenly divide into the access sizes
857     // TODO: May be able to widen depending on alignment etc.
858     unsigned NumRegs = (MemSize + 31) / 32;
859     if (NumRegs == 3) {
860       if (!ST.hasDwordx3LoadStores())
861         return true;
862     } else {
863       // If the alignment allows, these should have been widened.
864       if (!isPowerOf2_32(NumRegs))
865         return true;
866     }
867 
868     if (Align < MemSize) {
869       const SITargetLowering *TLI = ST.getTargetLowering();
870       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
871     }
872 
873     return false;
874   };
875 
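  // Return true if a load with a non-power-of-2 result size should instead be
  // widened to the next power of 2. This only applies when the original size
  // is below the address space limit and the alignment covers the rounded-up
  // size.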
876   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
877                                          unsigned Opc) -> bool {
878     unsigned Size = Query.Types[0].getSizeInBits();
879     if (isPowerOf2_32(Size))
880       return false;
881 
882     if (Size == 96 && ST.hasDwordx3LoadStores())
883       return false;
884 
885     unsigned AddrSpace = Query.Types[1].getAddressSpace();
886     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
887       return false;
888 
889     unsigned Align = Query.MMODescrs[0].AlignInBits;
890     unsigned RoundedSize = NextPowerOf2(Size);
891     return (Align >= RoundedSize);
892   };
893 
894   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
895   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
896   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
897 
898   // TODO: Refine based on subtargets which support unaligned access or 128-bit
899   // LDS
900   // TODO: Unsupported flat for SI.
901 
902   for (unsigned Op : {G_LOAD, G_STORE}) {
903     const bool IsStore = Op == G_STORE;
904 
905     auto &Actions = getActionDefinitionsBuilder(Op);
906     // Whitelist some common cases.
907     // TODO: Does this help compile time at all?
908     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
909                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
910                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
911                                       {S64, GlobalPtr, 64, GlobalAlign32},
912                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
913                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
914                                       {S32, GlobalPtr, 8, GlobalAlign8},
915                                       {S32, GlobalPtr, 16, GlobalAlign16},
916 
917                                       {S32, LocalPtr, 32, 32},
918                                       {S64, LocalPtr, 64, 32},
919                                       {V2S32, LocalPtr, 64, 32},
920                                       {S32, LocalPtr, 8, 8},
921                                       {S32, LocalPtr, 16, 16},
922                                       {V2S16, LocalPtr, 32, 32},
923 
924                                       {S32, PrivatePtr, 32, 32},
925                                       {S32, PrivatePtr, 8, 8},
926                                       {S32, PrivatePtr, 16, 16},
927                                       {V2S16, PrivatePtr, 32, 32},
928 
929                                       {S32, ConstantPtr, 32, GlobalAlign32},
930                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
931                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
932                                       {S64, ConstantPtr, 64, GlobalAlign32},
933                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
934     Actions.legalIf(
935       [=](const LegalityQuery &Query) -> bool {
936         return isLoadStoreLegal(ST, Query, Op);
937       });
938 
939     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
940     // 64-bits.
941     //
942     // TODO: Should generalize bitcast action into coerce, which will also cover
943     // inserting addrspacecasts.
944     Actions.customIf(typeIs(1, Constant32Ptr));
945 
946     // Turn any illegal element vectors into something easier to deal
947     // with. These will ultimately produce 32-bit scalar shifts to extract the
948     // parts anyway.
949     //
950     // For odd 16-bit element vectors, prefer to split those into pieces with
951     // 16-bit vector parts.
952     Actions.bitcastIf(
953       [=](const LegalityQuery &Query) -> bool {
954         LLT Ty = Query.Types[0];
955         return Ty.isVector() &&
956                isRegisterSize(Ty.getSizeInBits()) &&
957                !isRegisterVectorElementType(Ty.getElementType());
958       }, bitcastToRegisterType(0));
959 
    Actions
962         // Widen suitably aligned loads by loading extra elements.
963         .moreElementsIf([=](const LegalityQuery &Query) {
964             const LLT Ty = Query.Types[0];
965             return Op == G_LOAD && Ty.isVector() &&
966                    shouldWidenLoadResult(Query, Op);
967           }, moreElementsToNextPow2(0))
968         .widenScalarIf([=](const LegalityQuery &Query) {
969             const LLT Ty = Query.Types[0];
970             return Op == G_LOAD && !Ty.isVector() &&
971                    shouldWidenLoadResult(Query, Op);
972           }, widenScalarOrEltToNextPow2(0))
973         .narrowScalarIf(
974             [=](const LegalityQuery &Query) -> bool {
975               return !Query.Types[0].isVector() &&
976                      needToSplitMemOp(Query, Op == G_LOAD);
977             },
978             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
979               const LLT DstTy = Query.Types[0];
980               const LLT PtrTy = Query.Types[1];
981 
982               const unsigned DstSize = DstTy.getSizeInBits();
983               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
984 
985               // Split extloads.
986               if (DstSize > MemSize)
987                 return std::make_pair(0, LLT::scalar(MemSize));
988 
989               if (!isPowerOf2_32(DstSize)) {
990                 // We're probably decomposing an odd sized store. Try to split
991                 // to the widest type. TODO: Account for alignment. As-is it
992                 // should be OK, since the new parts will be further legalized.
993                 unsigned FloorSize = PowerOf2Floor(DstSize);
994                 return std::make_pair(0, LLT::scalar(FloorSize));
995               }
996 
997               if (DstSize > 32 && (DstSize % 32 != 0)) {
998                 // FIXME: Need a way to specify non-extload of larger size if
999                 // suitably aligned.
1000                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1001               }
1002 
1003               unsigned MaxSize = maxSizeForAddrSpace(ST,
1004                                                      PtrTy.getAddressSpace(),
1005                                                      Op == G_LOAD);
1006               if (MemSize > MaxSize)
1007                 return std::make_pair(0, LLT::scalar(MaxSize));
1008 
1009               unsigned Align = Query.MMODescrs[0].AlignInBits;
1010               return std::make_pair(0, LLT::scalar(Align));
1011             })
1012         .fewerElementsIf(
1013             [=](const LegalityQuery &Query) -> bool {
1014               return Query.Types[0].isVector() &&
1015                      needToSplitMemOp(Query, Op == G_LOAD);
1016             },
1017             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1018               const LLT DstTy = Query.Types[0];
1019               const LLT PtrTy = Query.Types[1];
1020 
1021               LLT EltTy = DstTy.getElementType();
1022               unsigned MaxSize = maxSizeForAddrSpace(ST,
1023                                                      PtrTy.getAddressSpace(),
1024                                                      Op == G_LOAD);
1025 
1026               // FIXME: Handle widened to power of 2 results better. This ends
1027               // up scalarizing.
1028               // FIXME: 3 element stores scalarized on SI
1029 
1030               // Split if it's too large for the address space.
1031               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1032                 unsigned NumElts = DstTy.getNumElements();
1033                 unsigned EltSize = EltTy.getSizeInBits();
1034 
1035                 if (MaxSize % EltSize == 0) {
1036                   return std::make_pair(
1037                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1038                 }
1039 
1040                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1041 
1042                 // FIXME: Refine when odd breakdowns handled
1043                 // The scalars will need to be re-legalized.
1044                 if (NumPieces == 1 || NumPieces >= NumElts ||
1045                     NumElts % NumPieces != 0)
1046                   return std::make_pair(0, EltTy);
1047 
1048                 return std::make_pair(0,
1049                                       LLT::vector(NumElts / NumPieces, EltTy));
1050               }
1051 
1052               // FIXME: We could probably handle weird extending loads better.
1053               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1054               if (DstTy.getSizeInBits() > MemSize)
1055                 return std::make_pair(0, EltTy);
1056 
1057               unsigned EltSize = EltTy.getSizeInBits();
1058               unsigned DstSize = DstTy.getSizeInBits();
1059               if (!isPowerOf2_32(DstSize)) {
1060                 // We're probably decomposing an odd sized store. Try to split
1061                 // to the widest type. TODO: Account for alignment. As-is it
1062                 // should be OK, since the new parts will be further legalized.
1063                 unsigned FloorSize = PowerOf2Floor(DstSize);
1064                 return std::make_pair(
1065                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1066               }
1067 
1068               // Need to split because of alignment.
1069               unsigned Align = Query.MMODescrs[0].AlignInBits;
1070               if (EltSize > Align &&
1071                   (EltSize / Align < DstTy.getNumElements())) {
1072                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1073               }
1074 
1075               // May need relegalization for the scalars.
1076               return std::make_pair(0, EltTy);
1077             })
1078         .minScalar(0, S32);
1079 
1080     if (IsStore)
1081       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1082 
1083     // TODO: Need a bitcast lower option?
1084     Actions
1085         .widenScalarToNextPow2(0)
1086         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1087   }
1088 
1089   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1090                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1091                                                   {S32, GlobalPtr, 16, 2 * 8},
1092                                                   {S32, LocalPtr, 8, 8},
1093                                                   {S32, LocalPtr, 16, 16},
1094                                                   {S32, PrivatePtr, 8, 8},
1095                                                   {S32, PrivatePtr, 16, 16},
1096                                                   {S32, ConstantPtr, 8, 8},
1097                                                   {S32, ConstantPtr, 16, 2 * 8}});
1098   if (ST.hasFlatAddressSpace()) {
1099     ExtLoads.legalForTypesWithMemDesc(
1100         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1101   }
1102 
1103   ExtLoads.clampScalar(0, S32, S32)
1104           .widenScalarToNextPow2(0)
1105           .unsupportedIfMemSizeNotPow2()
1106           .lower();
1107 
1108   auto &Atomics = getActionDefinitionsBuilder(
1109     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1110      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1111      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1112      G_ATOMICRMW_UMIN})
1113     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1114                {S64, GlobalPtr}, {S64, LocalPtr}});
1115   if (ST.hasFlatAddressSpace()) {
1116     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1117   }
1118 
1119   if (ST.hasLDSFPAtomics()) {
1120     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1121       .legalFor({{S32, LocalPtr}});
1122   }
1123 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1126   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1127     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1128                 {S32, FlatPtr}, {S64, FlatPtr}})
1129     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1130                {S32, RegionPtr}, {S64, RegionPtr}});
1131   // TODO: Pointer types, any 32-bit or 64-bit vector
1132 
1133   // Condition should be s32 for scalar, s1 for vector.
1134   getActionDefinitionsBuilder(G_SELECT)
1135     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1136           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1137           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1138     .clampScalar(0, S16, S64)
1139     .scalarize(1)
1140     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1141     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1142     .clampMaxNumElements(0, S32, 2)
1143     .clampMaxNumElements(0, LocalPtr, 2)
1144     .clampMaxNumElements(0, PrivatePtr, 2)
1145     .scalarize(0)
1146     .widenScalarToNextPow2(0)
1147     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1148 
1149   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1150   // be more flexible with the shift amount type.
1151   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1152     .legalFor({{S32, S32}, {S64, S32}});
1153   if (ST.has16BitInsts()) {
1154     if (ST.hasVOP3PInsts()) {
1155       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1156             .clampMaxNumElements(0, S16, 2);
1157     } else
1158       Shifts.legalFor({{S16, S16}});
1159 
1160     // TODO: Support 16-bit shift amounts for all types
1161     Shifts.widenScalarIf(
1162       [=](const LegalityQuery &Query) {
1163         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1164         // 32-bit amount.
1165         const LLT ValTy = Query.Types[0];
1166         const LLT AmountTy = Query.Types[1];
1167         return ValTy.getSizeInBits() <= 16 &&
1168                AmountTy.getSizeInBits() < 16;
1169       }, changeTo(1, S16));
1170     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1171     Shifts.clampScalar(1, S32, S32);
1172     Shifts.clampScalar(0, S16, S64);
1173     Shifts.widenScalarToNextPow2(0, 16);
1174   } else {
1175     // Make sure we legalize the shift amount type first, as the general
1176     // expansion for the shifted type will produce much worse code if it hasn't
1177     // been truncated already.
1178     Shifts.clampScalar(1, S32, S32);
1179     Shifts.clampScalar(0, S32, S64);
1180     Shifts.widenScalarToNextPow2(0, 32);
1181   }
1182   Shifts.scalarize(0);
1183 
1184   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1185     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1186     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1187     unsigned IdxTypeIdx = 2;
1188 
1189     getActionDefinitionsBuilder(Op)
1190       .customIf([=](const LegalityQuery &Query) {
1191           const LLT EltTy = Query.Types[EltTypeIdx];
1192           const LLT VecTy = Query.Types[VecTypeIdx];
1193           const LLT IdxTy = Query.Types[IdxTypeIdx];
1194           return (EltTy.getSizeInBits() == 16 ||
1195                   EltTy.getSizeInBits() % 32 == 0) &&
1196                  VecTy.getSizeInBits() % 32 == 0 &&
1197                  VecTy.getSizeInBits() <= 1024 &&
1198                  IdxTy.getSizeInBits() == 32;
1199         })
1200       .clampScalar(EltTypeIdx, S32, S64)
1201       .clampScalar(VecTypeIdx, S32, S64)
1202       .clampScalar(IdxTypeIdx, S32, S32);
1203   }
1204 
1205   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1206     .unsupportedIf([=](const LegalityQuery &Query) {
1207         const LLT &EltTy = Query.Types[1].getElementType();
1208         return Query.Types[0] != EltTy;
1209       });
1210 
1211   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1212     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1213     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1214 
1215     // FIXME: Doesn't handle extract of illegal sizes.
1216     getActionDefinitionsBuilder(Op)
1217       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1218       // FIXME: Multiples of 16 should not be legal.
1219       .legalIf([=](const LegalityQuery &Query) {
1220           const LLT BigTy = Query.Types[BigTyIdx];
1221           const LLT LitTy = Query.Types[LitTyIdx];
1222           return (BigTy.getSizeInBits() % 32 == 0) &&
1223                  (LitTy.getSizeInBits() % 16 == 0);
1224         })
1225       .widenScalarIf(
1226         [=](const LegalityQuery &Query) {
1227           const LLT BigTy = Query.Types[BigTyIdx];
1228           return (BigTy.getScalarSizeInBits() < 16);
1229         },
1230         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1231       .widenScalarIf(
1232         [=](const LegalityQuery &Query) {
1233           const LLT LitTy = Query.Types[LitTyIdx];
1234           return (LitTy.getScalarSizeInBits() < 16);
1235         },
1236         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1237       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
1241 
1242   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1243     .legalForCartesianProduct(AllS32Vectors, {S32})
1244     .legalForCartesianProduct(AllS64Vectors, {S64})
1245     .clampNumElements(0, V16S32, V32S32)
1246     .clampNumElements(0, V2S64, V16S64)
1247     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1248 
1249   if (ST.hasScalarPackInsts()) {
1250     BuildVector
1251       // FIXME: Should probably widen s1 vectors straight to s32
1252       .minScalarOrElt(0, S16)
1253       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1254       .minScalar(1, S32);
1255 
1256     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1257       .legalFor({V2S16, S32})
1258       .lower();
1259     BuildVector.minScalarOrElt(0, S32);
1260   } else {
1261     BuildVector.customFor({V2S16, S16});
1262     BuildVector.minScalarOrElt(0, S32);
1263 
1264     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1265       .customFor({V2S16, S32})
1266       .lower();
1267   }
1268 
1269   BuildVector.legalIf(isRegisterType(0));
1270 
1271   // FIXME: Clamp maximum size
1272   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1273     .legalIf(isRegisterType(0));
1274 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1277   if (ST.hasVOP3PInsts()) {
1278     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1279       .customFor({V2S16, V2S16})
1280       .lower();
1281   } else
1282     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1283 
1284   // Merge/Unmerge
1285   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1286     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1287     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1288 
1289     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1290       const LLT Ty = Query.Types[TypeIdx];
1291       if (Ty.isVector()) {
1292         const LLT &EltTy = Ty.getElementType();
1293         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1294           return true;
1295         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1296           return true;
1297       }
1298       return false;
1299     };
1300 
1301     auto &Builder = getActionDefinitionsBuilder(Op)
1302       .lowerFor({{S16, V2S16}})
1303       .lowerIf([=](const LegalityQuery &Query) {
1304           const LLT BigTy = Query.Types[BigTyIdx];
1305           return BigTy.getSizeInBits() == 32;
1306         })
1307       // Try to widen to s16 first for small types.
1308       // TODO: Only do this on targets with legal s16 shifts
1309       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1310       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1311       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1312       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1313                            elementTypeIs(1, S16)),
1314                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1318       .clampScalar(LitTyIdx, S32, S512)
1319       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1320       // Break up vectors with weird elements into scalars
1321       .fewerElementsIf(
1322         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1323         scalarize(0))
1324       .fewerElementsIf(
1325         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1326         scalarize(1))
1327       .clampScalar(BigTyIdx, S32, S1024);
1328 
1329     if (Op == G_MERGE_VALUES) {
1330       Builder.widenScalarIf(
1331         // TODO: Use 16-bit shifts if legal for 8-bit values?
1332         [=](const LegalityQuery &Query) {
1333           const LLT Ty = Query.Types[LitTyIdx];
1334           return Ty.getSizeInBits() < 32;
1335         },
1336         changeTo(LitTyIdx, S32));
1337     }
1338 
1339     Builder.widenScalarIf(
1340       [=](const LegalityQuery &Query) {
1341         const LLT Ty = Query.Types[BigTyIdx];
1342         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1343           Ty.getSizeInBits() % 16 != 0;
1344       },
1345       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
1348         const LLT &Ty = Query.Types[BigTyIdx];
1349         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1350         if (NewSizeInBits >= 256) {
1351           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1352           if (RoundedTo < NewSizeInBits)
1353             NewSizeInBits = RoundedTo;
1354         }
1355         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1356       })
1357       .legalIf([=](const LegalityQuery &Query) {
1358           const LLT &BigTy = Query.Types[BigTyIdx];
1359           const LLT &LitTy = Query.Types[LitTyIdx];
1360 
1361           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1362             return false;
1363           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1364             return false;
1365 
1366           return BigTy.getSizeInBits() % 16 == 0 &&
1367                  LitTy.getSizeInBits() % 16 == 0 &&
1368                  BigTy.getSizeInBits() <= 1024;
1369         })
1370       // Any vectors left are the wrong size. Scalarize them.
1371       .scalarize(0)
1372       .scalarize(1);
1373   }
1374 
1375   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1376   // RegBankSelect.
1377   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1378     .legalFor({{S32}, {S64}});
1379 
1380   if (ST.hasVOP3PInsts()) {
1381     SextInReg.lowerFor({{V2S16}})
1382       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1383       // get more vector shift opportunities, since we'll get those when
1384       // expanded.
1385       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1386   } else if (ST.has16BitInsts()) {
1387     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1388   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1391     SextInReg.lowerFor({{S32}, {S64}});
1392   }
1393 
1394   SextInReg
1395     .scalarize(0)
1396     .clampScalar(0, S32, S64)
1397     .lower();
1398 
1399   getActionDefinitionsBuilder(G_FSHR)
1400     .legalFor({{S32, S32}})
1401     .scalarize(0)
1402     .lower();
1403 
1404   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1405     .legalFor({S64});
1406 
1407   getActionDefinitionsBuilder({
1408       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1409       G_FCOPYSIGN,
1410 
1411       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1412       G_READ_REGISTER,
1413       G_WRITE_REGISTER,
1414 
1415       G_SADDO, G_SSUBO,
1416 
      // TODO: Implement
1418       G_FMINIMUM, G_FMAXIMUM,
1419       G_FSHL
1420     }).lower();
1421 
1422   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1423         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1424         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1425     .unsupported();
1426 
1427   computeTables();
1428   verify(*ST.getInstrInfo());
1429 }
1430 
1431 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1432                                          MachineRegisterInfo &MRI,
1433                                          MachineIRBuilder &B,
1434                                          GISelChangeObserver &Observer) const {
1435   switch (MI.getOpcode()) {
1436   case TargetOpcode::G_ADDRSPACE_CAST:
1437     return legalizeAddrSpaceCast(MI, MRI, B);
1438   case TargetOpcode::G_FRINT:
1439     return legalizeFrint(MI, MRI, B);
1440   case TargetOpcode::G_FCEIL:
1441     return legalizeFceil(MI, MRI, B);
1442   case TargetOpcode::G_INTRINSIC_TRUNC:
1443     return legalizeIntrinsicTrunc(MI, MRI, B);
1444   case TargetOpcode::G_SITOFP:
1445     return legalizeITOFP(MI, MRI, B, true);
1446   case TargetOpcode::G_UITOFP:
1447     return legalizeITOFP(MI, MRI, B, false);
1448   case TargetOpcode::G_FPTOSI:
1449     return legalizeFPTOI(MI, MRI, B, true);
1450   case TargetOpcode::G_FPTOUI:
1451     return legalizeFPTOI(MI, MRI, B, false);
1452   case TargetOpcode::G_FMINNUM:
1453   case TargetOpcode::G_FMAXNUM:
1454   case TargetOpcode::G_FMINNUM_IEEE:
1455   case TargetOpcode::G_FMAXNUM_IEEE:
1456     return legalizeMinNumMaxNum(MI, MRI, B);
1457   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1458     return legalizeExtractVectorElt(MI, MRI, B);
1459   case TargetOpcode::G_INSERT_VECTOR_ELT:
1460     return legalizeInsertVectorElt(MI, MRI, B);
1461   case TargetOpcode::G_SHUFFLE_VECTOR:
1462     return legalizeShuffleVector(MI, MRI, B);
1463   case TargetOpcode::G_FSIN:
1464   case TargetOpcode::G_FCOS:
1465     return legalizeSinCos(MI, MRI, B);
1466   case TargetOpcode::G_GLOBAL_VALUE:
1467     return legalizeGlobalValue(MI, MRI, B);
1468   case TargetOpcode::G_LOAD:
1469     return legalizeLoad(MI, MRI, B, Observer);
1470   case TargetOpcode::G_FMAD:
1471     return legalizeFMad(MI, MRI, B);
1472   case TargetOpcode::G_FDIV:
1473     return legalizeFDIV(MI, MRI, B);
1474   case TargetOpcode::G_UDIV:
1475   case TargetOpcode::G_UREM:
1476     return legalizeUDIV_UREM(MI, MRI, B);
1477   case TargetOpcode::G_SDIV:
1478   case TargetOpcode::G_SREM:
1479     return legalizeSDIV_SREM(MI, MRI, B);
1480   case TargetOpcode::G_ATOMIC_CMPXCHG:
1481     return legalizeAtomicCmpXChg(MI, MRI, B);
1482   case TargetOpcode::G_FLOG:
1483     return legalizeFlog(MI, B, numbers::ln2f);
1484   case TargetOpcode::G_FLOG10:
1485     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1486   case TargetOpcode::G_FEXP:
1487     return legalizeFExp(MI, B);
1488   case TargetOpcode::G_FPOW:
1489     return legalizeFPow(MI, B);
1490   case TargetOpcode::G_FFLOOR:
1491     return legalizeFFloor(MI, MRI, B);
1492   case TargetOpcode::G_BUILD_VECTOR:
1493     return legalizeBuildVector(MI, MRI, B);
1494   default:
1495     return false;
1496   }
1497 
1498   llvm_unreachable("expected switch to return");
1499 }
1500 
1501 Register AMDGPULegalizerInfo::getSegmentAperture(
1502   unsigned AS,
1503   MachineRegisterInfo &MRI,
1504   MachineIRBuilder &B) const {
1505   MachineFunction &MF = B.getMF();
1506   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1507   const LLT S32 = LLT::scalar(32);
1508 
1509   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1510 
1511   if (ST.hasApertureRegs()) {
1512     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1513     // getreg.
1514     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1515         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1516         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1517     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1518         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1519         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1520     unsigned Encoding =
1521         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1522         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1523         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1524 
1525     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1526 
1527     B.buildInstr(AMDGPU::S_GETREG_B32)
1528       .addDef(GetReg)
1529       .addImm(Encoding);
1530     MRI.setType(GetReg, S32);
1531 
1532     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1533     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1534   }
1535 
1536   Register QueuePtr = MRI.createGenericVirtualRegister(
1537     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1538 
1539   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1540   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1541     return Register();
1542 
1543   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1544   // private_segment_aperture_base_hi.
1545   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1546 
1547   // TODO: can we be smarter about machine pointer info?
1548   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1549   MachineMemOperand *MMO = MF.getMachineMemOperand(
1550       PtrInfo,
1551       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1552           MachineMemOperand::MOInvariant,
1553       4, commonAlignment(Align(64), StructOffset));
1554 
1555   Register LoadAddr;
1556 
1557   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1558   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1559 }
1560 
1561 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1562   MachineInstr &MI, MachineRegisterInfo &MRI,
1563   MachineIRBuilder &B) const {
1564   MachineFunction &MF = B.getMF();
1565 
1566   const LLT S32 = LLT::scalar(32);
1567   Register Dst = MI.getOperand(0).getReg();
1568   Register Src = MI.getOperand(1).getReg();
1569 
1570   LLT DstTy = MRI.getType(Dst);
1571   LLT SrcTy = MRI.getType(Src);
1572   unsigned DestAS = DstTy.getAddressSpace();
1573   unsigned SrcAS = SrcTy.getAddressSpace();
1574 
1575   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1576   // vector element.
1577   assert(!DstTy.isVector());
1578 
1579   const AMDGPUTargetMachine &TM
1580     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1581 
1582   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1583   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1584     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1585     return true;
1586   }
1587 
1588   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1589     // Truncate.
1590     B.buildExtract(Dst, Src, 0);
1591     MI.eraseFromParent();
1592     return true;
1593   }
1594 
1595   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1596     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1597     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1598 
1599     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1600     // another. Merge operands are required to be the same type, but creating an
1601     // extra ptrtoint would be kind of pointless.
1602     auto HighAddr = B.buildConstant(
1603       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1604     B.buildMerge(Dst, {Src, HighAddr});
1605     MI.eraseFromParent();
1606     return true;
1607   }
1608 
1609   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1610     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1611            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1612     unsigned NullVal = TM.getNullPointerValue(DestAS);
1613 
1614     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1615     auto FlatNull = B.buildConstant(SrcTy, 0);
1616 
1617     // Extract low 32-bits of the pointer.
1618     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1619 
1620     auto CmpRes =
1621         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1622     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1623 
1624     MI.eraseFromParent();
1625     return true;
1626   }
1627 
1628   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1629     return false;
1630 
1631   if (!ST.hasFlatAddressSpace())
1632     return false;
1633 
1634   auto SegmentNull =
1635       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1636   auto FlatNull =
1637       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
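  // A local/private pointer is converted to flat by placing its 32-bit offset
  // in the low half and the segment aperture base in the high half, unless the
  // source is the segment's null value, in which case the flat null is used.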
1638 
1639   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1640   if (!ApertureReg.isValid())
1641     return false;
1642 
1643   auto CmpRes =
1644       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1645 
1646   // Coerce the type of the low half of the result so we can use merge_values.
1647   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1648 
1649   // TODO: Should we allow mismatched types but matching sizes in merges to
1650   // avoid the ptrtoint?
1651   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1652   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1653 
1654   MI.eraseFromParent();
1655   return true;
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeFrint(
1659   MachineInstr &MI, MachineRegisterInfo &MRI,
1660   MachineIRBuilder &B) const {
1661   Register Src = MI.getOperand(1).getReg();
1662   LLT Ty = MRI.getType(Src);
1663   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1664 
1665   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1666   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
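  // Adding and subtracting 2^52 (with the sign of the source) forces the
  // fraction to round away, since a double has 52 fraction bits. C2 is the
  // largest magnitude that still has a fractional part; above it the source is
  // already an integer and is selected unchanged.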
1667 
1668   auto C1 = B.buildFConstant(Ty, C1Val);
1669   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1670 
1671   // TODO: Should this propagate fast-math-flags?
1672   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1673   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1674 
1675   auto C2 = B.buildFConstant(Ty, C2Val);
1676   auto Fabs = B.buildFAbs(Ty, Src);
1677 
1678   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1679   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1680   return true;
1681 }
1682 
1683 bool AMDGPULegalizerInfo::legalizeFceil(
1684   MachineInstr &MI, MachineRegisterInfo &MRI,
1685   MachineIRBuilder &B) const {
1686 
1687   const LLT S1 = LLT::scalar(1);
1688   const LLT S64 = LLT::scalar(64);
1689 
1690   Register Src = MI.getOperand(1).getReg();
1691   assert(MRI.getType(Src) == S64);
1692 
1693   // result = trunc(src)
1694   // if (src > 0.0 && src != result)
1695   //   result += 1.0
1696 
1697   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1698 
1699   const auto Zero = B.buildFConstant(S64, 0.0);
1700   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1704   auto Add = B.buildSelect(S64, And, One, Zero);
1705 
1706   // TODO: Should this propagate fast-math-flags?
1707   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1708   return true;
1709 }
1710 
1711 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1712                                               MachineIRBuilder &B) {
1713   const unsigned FractBits = 52;
1714   const unsigned ExpBits = 11;
1715   LLT S32 = LLT::scalar(32);
1716 
1717   auto Const0 = B.buildConstant(S32, FractBits - 32);
1718   auto Const1 = B.buildConstant(S32, ExpBits);
1719 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1723 
1724   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1725 }
1726 
1727 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1728   MachineInstr &MI, MachineRegisterInfo &MRI,
1729   MachineIRBuilder &B) const {
1730   const LLT S1 = LLT::scalar(1);
1731   const LLT S32 = LLT::scalar(32);
1732   const LLT S64 = LLT::scalar(64);
1733 
1734   Register Src = MI.getOperand(1).getReg();
1735   assert(MRI.getType(Src) == S64);
1736 
1737   // TODO: Should this use extract since the low half is unused?
1738   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1739   Register Hi = Unmerge.getReg(1);
1740 
1741   // Extract the upper half, since this is where we will find the sign and
1742   // exponent.
1743   auto Exp = extractF64Exponent(Hi, B);
1744 
1745   const unsigned FractBits = 52;
1746 
1747   // Extract the sign bit.
1748   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1749   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1750 
1751   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1752 
1753   const auto Zero32 = B.buildConstant(S32, 0);
1754 
1755   // Extend back to 64-bits.
1756   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
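  // Shifting the fraction mask right by the unbiased exponent leaves set bits
  // only in positions that still hold fractional data; clearing those bits in
  // the source truncates toward zero. The selects handle exponent < 0 (result
  // is just the sign) and exponent > 51 (already an integer).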
1757 
1758   auto Shr = B.buildAShr(S64, FractMask, Exp);
1759   auto Not = B.buildNot(S64, Shr);
1760   auto Tmp0 = B.buildAnd(S64, Src, Not);
1761   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1762 
1763   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1764   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1765 
1766   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1767   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1768   return true;
1769 }
1770 
1771 bool AMDGPULegalizerInfo::legalizeITOFP(
1772   MachineInstr &MI, MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B, bool Signed) const {
1774 
1775   Register Dst = MI.getOperand(0).getReg();
1776   Register Src = MI.getOperand(1).getReg();
1777 
1778   const LLT S64 = LLT::scalar(64);
1779   const LLT S32 = LLT::scalar(32);
1780 
1781   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
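  // Convert the two halves separately and recombine as (fp)Hi * 2^32 + (fp)Lo.
  // Only the high half carries the sign, so the low half is always unsigned.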
1782 
1783   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1784 
1785   auto CvtHi = Signed ?
1786     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1787     B.buildUITOFP(S64, Unmerge.getReg(1));
1788 
1789   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1790 
1791   auto ThirtyTwo = B.buildConstant(S32, 32);
1792   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1793     .addUse(CvtHi.getReg(0))
1794     .addUse(ThirtyTwo.getReg(0));
1795 
1796   // TODO: Should this propagate fast-math-flags?
1797   B.buildFAdd(Dst, LdExp, CvtLo);
1798   MI.eraseFromParent();
1799   return true;
1800 }
1801 
1802 // TODO: Copied from DAG implementation. Verify logic and document how this
1803 // actually works.
1804 bool AMDGPULegalizerInfo::legalizeFPTOI(
1805   MachineInstr &MI, MachineRegisterInfo &MRI,
1806   MachineIRBuilder &B, bool Signed) const {
1807 
1808   Register Dst = MI.getOperand(0).getReg();
1809   Register Src = MI.getOperand(1).getReg();
1810 
1811   const LLT S64 = LLT::scalar(64);
1812   const LLT S32 = LLT::scalar(32);
1813 
1814   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1815 
1816   unsigned Flags = MI.getFlags();
1817 
1818   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1819   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1820   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
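  // K0 = 2^-32 and K1 = -2^32, so FloorMul is the high 32-bit word of the
  // truncated value and Fma = Trunc - FloorMul * 2^32 recovers the low word.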
1821 
1822   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1823   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1824   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1825 
1826   auto Hi = Signed ?
1827     B.buildFPTOSI(S32, FloorMul) :
1828     B.buildFPTOUI(S32, FloorMul);
1829   auto Lo = B.buildFPTOUI(S32, Fma);
1830 
1831   B.buildMerge(Dst, { Lo, Hi });
1832   MI.eraseFromParent();
1833 
1834   return true;
1835 }
1836 
1837 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1838   MachineInstr &MI, MachineRegisterInfo &MRI,
1839   MachineIRBuilder &B) const {
1840   MachineFunction &MF = B.getMF();
1841   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1842 
1843   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1844                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1845 
1846   // With ieee_mode disabled, the instructions have the correct behavior
1847   // already for G_FMINNUM/G_FMAXNUM
1848   if (!MFI->getMode().IEEE)
1849     return !IsIEEEOp;
1850 
1851   if (IsIEEEOp)
1852     return true;
1853 
1854   MachineIRBuilder HelperBuilder(MI);
1855   GISelObserverWrapper DummyObserver;
1856   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1857   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1858 }
1859 
1860 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1861   MachineInstr &MI, MachineRegisterInfo &MRI,
1862   MachineIRBuilder &B) const {
1863   // TODO: Should move some of this into LegalizerHelper.
1864 
1865   // TODO: Promote dynamic indexing of s16 to s32
1866 
1867   // FIXME: Artifact combiner probably should have replaced the truncated
1868   // constant before this, so we shouldn't need
1869   // getConstantVRegValWithLookThrough.
1870   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1871     MI.getOperand(2).getReg(), MRI);
1872   if (!IdxVal) // Dynamic case will be selected to register indexing.
1873     return true;
1874 
1875   Register Dst = MI.getOperand(0).getReg();
1876   Register Vec = MI.getOperand(1).getReg();
1877 
1878   LLT VecTy = MRI.getType(Vec);
1879   LLT EltTy = VecTy.getElementType();
1880   assert(EltTy == MRI.getType(Dst));
1881 
1882   if (IdxVal->Value < VecTy.getNumElements())
1883     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1884   else
1885     B.buildUndef(Dst);
1886 
1887   MI.eraseFromParent();
1888   return true;
1889 }
1890 
1891 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1892   MachineInstr &MI, MachineRegisterInfo &MRI,
1893   MachineIRBuilder &B) const {
1894   // TODO: Should move some of this into LegalizerHelper.
1895 
1896   // TODO: Promote dynamic indexing of s16 to s32
1897 
1898   // FIXME: Artifact combiner probably should have replaced the truncated
1899   // constant before this, so we shouldn't need
1900   // getConstantVRegValWithLookThrough.
1901   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1902     MI.getOperand(3).getReg(), MRI);
1903   if (!IdxVal) // Dynamic case will be selected to register indexing.
1904     return true;
1905 
1906   Register Dst = MI.getOperand(0).getReg();
1907   Register Vec = MI.getOperand(1).getReg();
1908   Register Ins = MI.getOperand(2).getReg();
1909 
1910   LLT VecTy = MRI.getType(Vec);
1911   LLT EltTy = VecTy.getElementType();
1912   assert(EltTy == MRI.getType(Ins));
1913 
1914   if (IdxVal->Value < VecTy.getNumElements())
1915     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1916   else
1917     B.buildUndef(Dst);
1918 
1919   MI.eraseFromParent();
1920   return true;
1921 }
1922 
1923 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1924   MachineInstr &MI, MachineRegisterInfo &MRI,
1925   MachineIRBuilder &B) const {
1926   const LLT V2S16 = LLT::vector(2, 16);
1927 
1928   Register Dst = MI.getOperand(0).getReg();
1929   Register Src0 = MI.getOperand(1).getReg();
1930   LLT DstTy = MRI.getType(Dst);
1931   LLT SrcTy = MRI.getType(Src0);
1932 
1933   if (SrcTy == V2S16 && DstTy == V2S16 &&
1934       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1935     return true;
1936 
1937   MachineIRBuilder HelperBuilder(MI);
1938   GISelObserverWrapper DummyObserver;
1939   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1940   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1941 }
1942 
1943 bool AMDGPULegalizerInfo::legalizeSinCos(
1944   MachineInstr &MI, MachineRegisterInfo &MRI,
1945   MachineIRBuilder &B) const {
1946 
1947   Register DstReg = MI.getOperand(0).getReg();
1948   Register SrcReg = MI.getOperand(1).getReg();
1949   LLT Ty = MRI.getType(DstReg);
1950   unsigned Flags = MI.getFlags();
1951 
1952   Register TrigVal;
1953   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1954   if (ST.hasTrigReducedRange()) {
1955     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1956     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1957       .addUse(MulVal.getReg(0))
1958       .setMIFlags(Flags).getReg(0);
1959   } else
1960     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1961 
1962   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1963     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1964   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1965     .addUse(TrigVal)
1966     .setMIFlags(Flags);
1967   MI.eraseFromParent();
1968   return true;
1969 }
1970 
1971 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1972   Register DstReg, LLT PtrTy,
1973   MachineIRBuilder &B, const GlobalValue *GV,
1974   unsigned Offset, unsigned GAFlags) const {
1975   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1976   // to the following code sequence:
1977   //
1978   // For constant address space:
1979   //   s_getpc_b64 s[0:1]
1980   //   s_add_u32 s0, s0, $symbol
1981   //   s_addc_u32 s1, s1, 0
1982   //
1983   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1984   //   a fixup or relocation is emitted to replace $symbol with a literal
1985   //   constant, which is a pc-relative offset from the encoding of the $symbol
1986   //   operand to the global variable.
1987   //
1988   // For global address space:
1989   //   s_getpc_b64 s[0:1]
1990   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1991   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1992   //
1993   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1994   //   fixups or relocations are emitted to replace $symbol@*@lo and
1995   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1996   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1997   //   operand to the global variable.
1998   //
1999   // What we want here is an offset from the value returned by s_getpc
2000   // (which is the address of the s_add_u32 instruction) to the global
2001   // variable, but since the encoding of $symbol starts 4 bytes after the start
2002   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2003   // small. This requires us to add 4 to the global variable offset in order to
2004   // compute the correct address.
2005 
2006   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2007 
2008   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2009     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2010 
2011   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2012     .addDef(PCReg);
2013 
2014   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2015   if (GAFlags == SIInstrInfo::MO_NONE)
2016     MIB.addImm(0);
2017   else
2018     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2019 
2020   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2021 
2022   if (PtrTy.getSizeInBits() == 32)
2023     B.buildExtract(DstReg, PCReg, 0);
2024   return true;
2025  }
2026 
2027 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2028   MachineInstr &MI, MachineRegisterInfo &MRI,
2029   MachineIRBuilder &B) const {
2030   Register DstReg = MI.getOperand(0).getReg();
2031   LLT Ty = MRI.getType(DstReg);
2032   unsigned AS = Ty.getAddressSpace();
2033 
2034   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2035   MachineFunction &MF = B.getMF();
2036   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2037 
2038   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2039     if (!MFI->isEntryFunction()) {
2040       const Function &Fn = MF.getFunction();
2041       DiagnosticInfoUnsupported BadLDSDecl(
2042         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2043         DS_Warning);
2044       Fn.getContext().diagnose(BadLDSDecl);
2045 
2046       // We currently don't have a way to correctly allocate LDS objects that
2047       // aren't directly associated with a kernel. We do force inlining of
2048       // functions that use local objects. However, if these dead functions are
2049       // not eliminated, we don't want a compile time error. Just emit a warning
2050       // and a trap, since there should be no callable path here.
2051       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2052       B.buildUndef(DstReg);
2053       MI.eraseFromParent();
2054       return true;
2055     }
2056 
2057     // TODO: We could emit code to handle the initialization somewhere.
2058     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2059       const SITargetLowering *TLI = ST.getTargetLowering();
2060       if (!TLI->shouldUseLDSConstAddress(GV)) {
2061         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2063       }
2064 
2065       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2066       MI.eraseFromParent();
2067       return true;
2068     }
2069 
2070     const Function &Fn = MF.getFunction();
2071     DiagnosticInfoUnsupported BadInit(
2072       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2073     Fn.getContext().diagnose(BadInit);
2074     return true;
2075   }
2076 
2077   const SITargetLowering *TLI = ST.getTargetLowering();
2078 
2079   if (TLI->shouldEmitFixup(GV)) {
2080     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2081     MI.eraseFromParent();
2082     return true;
2083   }
2084 
2085   if (TLI->shouldEmitPCReloc(GV)) {
2086     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2087     MI.eraseFromParent();
2088     return true;
2089   }
2090 
2091   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2092   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2093 
2094   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2095       MachinePointerInfo::getGOT(MF),
2096       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2097           MachineMemOperand::MOInvariant,
2098       8 /*Size*/, Align(8));
2099 
2100   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2101 
2102   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2104     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2105     B.buildExtract(DstReg, Load, 0);
2106   } else
2107     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2108 
2109   MI.eraseFromParent();
2110   return true;
2111 }
2112 
2113 bool AMDGPULegalizerInfo::legalizeLoad(
2114   MachineInstr &MI, MachineRegisterInfo &MRI,
2115   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2116   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2117   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2118   Observer.changingInstr(MI);
2119   MI.getOperand(1).setReg(Cast.getReg(0));
2120   Observer.changedInstr(MI);
2121   return true;
2122 }
2123 
2124 bool AMDGPULegalizerInfo::legalizeFMad(
2125   MachineInstr &MI, MachineRegisterInfo &MRI,
2126   MachineIRBuilder &B) const {
2127   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2128   assert(Ty.isScalar());
2129 
2130   MachineFunction &MF = B.getMF();
2131   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2132 
2133   // TODO: Always legal with future ftz flag.
2134   // FIXME: Do we need just output?
2135   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2136     return true;
2137   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2138     return true;
2139 
2140   MachineIRBuilder HelperBuilder(MI);
2141   GISelObserverWrapper DummyObserver;
2142   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2143   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2144 }
2145 
2146 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2147   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2148   Register DstReg = MI.getOperand(0).getReg();
2149   Register PtrReg = MI.getOperand(1).getReg();
2150   Register CmpVal = MI.getOperand(2).getReg();
2151   Register NewVal = MI.getOperand(3).getReg();
2152 
2153   assert(SITargetLowering::isFlatGlobalAddrSpace(
2154            MRI.getType(PtrReg).getAddressSpace()) &&
2155          "this should not have been custom lowered");
2156 
2157   LLT ValTy = MRI.getType(CmpVal);
2158   LLT VecTy = LLT::vector(2, ValTy);
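  // The target atomic consumes the new value and the compare value packed into
  // a two-element vector, with the new value in element 0.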
2159 
2160   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2161 
2162   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2163     .addDef(DstReg)
2164     .addUse(PtrReg)
2165     .addUse(PackedVal)
2166     .setMemRefs(MI.memoperands());
2167 
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 bool AMDGPULegalizerInfo::legalizeFlog(
2173   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2174   Register Dst = MI.getOperand(0).getReg();
2175   Register Src = MI.getOperand(1).getReg();
2176   LLT Ty = B.getMRI()->getType(Dst);
2177   unsigned Flags = MI.getFlags();
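  // log_b(x) = log2(x) * (1 / log2(b)); the caller supplies the constant
  // (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).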
2178 
2179   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2180   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2181 
2182   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2183   MI.eraseFromParent();
2184   return true;
2185 }
2186 
2187 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2188                                        MachineIRBuilder &B) const {
2189   Register Dst = MI.getOperand(0).getReg();
2190   Register Src = MI.getOperand(1).getReg();
2191   unsigned Flags = MI.getFlags();
2192   LLT Ty = B.getMRI()->getType(Dst);
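  // exp(x) = exp2(x * log2(e)).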
2193 
2194   auto K = B.buildFConstant(Ty, numbers::log2e);
2195   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2196   B.buildFExp2(Dst, Mul, Flags);
2197   MI.eraseFromParent();
2198   return true;
2199 }
2200 
2201 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2202                                        MachineIRBuilder &B) const {
2203   Register Dst = MI.getOperand(0).getReg();
2204   Register Src0 = MI.getOperand(1).getReg();
2205   Register Src1 = MI.getOperand(2).getReg();
2206   unsigned Flags = MI.getFlags();
2207   LLT Ty = B.getMRI()->getType(Dst);
2208   const LLT S16 = LLT::scalar(16);
2209   const LLT S32 = LLT::scalar(32);
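  // pow(x, y) is expanded as exp2(y * log2(x)). fmul_legacy treats 0 * x as 0,
  // so a zero log2 (e.g. pow(1.0, inf)) does not turn into a NaN.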
2210 
2211   if (Ty == S32) {
2212     auto Log = B.buildFLog2(S32, Src0, Flags);
2213     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2214       .addUse(Log.getReg(0))
2215       .addUse(Src1)
2216       .setMIFlags(Flags);
2217     B.buildFExp2(Dst, Mul, Flags);
2218   } else if (Ty == S16) {
2219     // There's no f16 fmul_legacy, so we need to convert for it.
2220     auto Log = B.buildFLog2(S16, Src0, Flags);
2221     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2222     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2223     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2224       .addUse(Ext0.getReg(0))
2225       .addUse(Ext1.getReg(0))
2226       .setMIFlags(Flags);
2227 
2228     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2229   } else
2230     return false;
2231 
2232   MI.eraseFromParent();
2233   return true;
2234 }
2235 
2236 // Find a source register, ignoring any possible source modifiers.
2237 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2238   Register ModSrc = OrigSrc;
2239   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2240     ModSrc = SrcFNeg->getOperand(1).getReg();
2241     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2242       ModSrc = SrcFAbs->getOperand(1).getReg();
2243   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2244     ModSrc = SrcFAbs->getOperand(1).getReg();
2245   return ModSrc;
2246 }
2247 
2248 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2249                                          MachineRegisterInfo &MRI,
2250                                          MachineIRBuilder &B) const {
2251 
2252   const LLT S1 = LLT::scalar(1);
2253   const LLT S64 = LLT::scalar(64);
2254   Register Dst = MI.getOperand(0).getReg();
2255   Register OrigSrc = MI.getOperand(1).getReg();
2256   unsigned Flags = MI.getFlags();
2257   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2258          "this should not have been custom lowered");
2259 
2260   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2261   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2262   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2263   // V_FRACT bug is:
2264   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2265   //
2266   // Convert floor(x) to (x - fract(x))
2267 
2268   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2269     .addUse(OrigSrc)
2270     .setMIFlags(Flags);
2271 
2272   // Give source modifier matching some assistance before obscuring a foldable
2273   // pattern.
2274 
2275   // TODO: We can avoid the neg on the fract? The input sign to fract
2276   // shouldn't matter?
2277   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
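  // 0x3fefffffffffffff is the largest double strictly less than 1.0, clamping
  // the buggy V_FRACT result back into [0, 1).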
2278 
2279   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2280 
2281   Register Min = MRI.createGenericVirtualRegister(S64);
2282 
2283   // We don't need to concern ourselves with the snan handling difference, so
2284   // use the one which will directly select.
2285   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2286   if (MFI->getMode().IEEE)
2287     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2288   else
2289     B.buildFMinNum(Min, Fract, Const, Flags);
2290 
2291   Register CorrectedFract = Min;
2292   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2293     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2294     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2295   }
2296 
2297   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2298   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2299 
2300   MI.eraseFromParent();
2301   return true;
2302 }
2303 
2304 // Turn an illegal packed v2s16 build vector into bit operations.
2305 // TODO: This should probably be a bitcast action in LegalizerHelper.
2306 bool AMDGPULegalizerInfo::legalizeBuildVector(
2307   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2308   Register Dst = MI.getOperand(0).getReg();
2309   const LLT S32 = LLT::scalar(32);
2310   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2311 
2312   Register Src0 = MI.getOperand(1).getReg();
2313   Register Src1 = MI.getOperand(2).getReg();
2314   assert(MRI.getType(Src0) == LLT::scalar(16));
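  // Merging the two 16-bit scalars puts Src0 in the low half and Src1 in the
  // high half of an s32, which is then bitcast to the packed v2s16 result.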
2315 
2316   auto Merge = B.buildMerge(S32, {Src0, Src1});
2317   B.buildBitcast(Dst, Merge);
2318 
2319   MI.eraseFromParent();
2320   return true;
2321 }
2322 
2323 // Return the use branch instruction, otherwise null if the usage is invalid.
2324 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2325                                        MachineRegisterInfo &MRI,
2326                                        MachineInstr *&Br,
2327                                        MachineBasicBlock *&UncondBrTarget) {
2328   Register CondDef = MI.getOperand(0).getReg();
2329   if (!MRI.hasOneNonDBGUse(CondDef))
2330     return nullptr;
2331 
2332   MachineBasicBlock *Parent = MI.getParent();
2333   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2334   if (UseMI.getParent() != Parent ||
2335       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2336     return nullptr;
2337 
2338   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2339   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2340   if (Next == Parent->end()) {
2341     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2342     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2343       return nullptr;
2344     UncondBrTarget = &*NextMBB;
2345   } else {
2346     if (Next->getOpcode() != AMDGPU::G_BR)
2347       return nullptr;
2348     Br = &*Next;
2349     UncondBrTarget = Br->getOperand(0).getMBB();
2350   }
2351 
2352   return &UseMI;
2353 }
2354 
2355 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2356                                                MachineRegisterInfo &MRI,
2357                                                Register LiveIn,
2358                                                Register PhyReg) const {
2359   assert(PhyReg.isPhysical() && "Physical register expected");
2360 
2361   // Insert the live-in copy, if required, by defining destination virtual
2362   // register.
2363   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2364   if (!MRI.getVRegDef(LiveIn)) {
2365     // FIXME: Should have scoped insert pt
2366     MachineBasicBlock &OrigInsBB = B.getMBB();
2367     auto OrigInsPt = B.getInsertPt();
2368 
2369     MachineBasicBlock &EntryMBB = B.getMF().front();
2370     EntryMBB.addLiveIn(PhyReg);
2371     B.setInsertPt(EntryMBB, EntryMBB.begin());
2372     B.buildCopy(LiveIn, PhyReg);
2373 
2374     B.setInsertPt(OrigInsBB, OrigInsPt);
2375   }
2376 
2377   return LiveIn;
2378 }
2379 
2380 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2381                                                 MachineRegisterInfo &MRI,
2382                                                 Register PhyReg, LLT Ty,
2383                                                 bool InsertLiveInCopy) const {
2384   assert(PhyReg.isPhysical() && "Physical register expected");
2385 
  // Get or create the virtual live-in register.
2387   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2388   if (!LiveIn) {
2389     LiveIn = MRI.createGenericVirtualRegister(Ty);
2390     MRI.addLiveIn(PhyReg, LiveIn);
2391   }
2392 
  // When the actual true copy required is from virtual register to physical
  // register (to be inserted later), live-in copy insertion from physical
  // register to virtual register is not required.
2396   if (!InsertLiveInCopy)
2397     return LiveIn;
2398 
2399   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2400 }
2401 
2402 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2403     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2404   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2405   const ArgDescriptor *Arg;
2406   const TargetRegisterClass *RC;
2407   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2408   if (!Arg) {
2409     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2410     return nullptr;
2411   }
2412   return Arg;
2413 }
2414 
2415 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2416                                          const ArgDescriptor *Arg) const {
2417   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2418     return false; // TODO: Handle these
2419 
2420   Register SrcReg = Arg->getRegister();
2421   assert(SrcReg.isPhysical() && "Physical register expected");
2422   assert(DstReg.isVirtual() && "Virtual register expected");
2423 
2424   MachineRegisterInfo &MRI = *B.getMRI();
2425 
2426   LLT Ty = MRI.getType(DstReg);
2427   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2428 
2429   if (Arg->isMasked()) {
2430     // TODO: Should we try to emit this once in the entry block?
2431     const LLT S32 = LLT::scalar(32);
2432     const unsigned Mask = Arg->getMask();
2433     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
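    // e.g. with Mask = 0xffc00, Shift is 10 and the value is extracted as
    // (LiveIn >> 10) & 0x3ff.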
2434 
2435     Register AndMaskSrc = LiveIn;
2436 
2437     if (Shift != 0) {
2438       auto ShiftAmt = B.buildConstant(S32, Shift);
2439       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2440     }
2441 
2442     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2443   } else {
2444     B.buildCopy(DstReg, LiveIn);
2445   }
2446 
2447   return true;
2448 }
2449 
2450 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2451     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2452     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2453 
2454   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2455   if (!Arg)
2456     return false;
2457 
2458   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2459     return false;
2460 
2461   MI.eraseFromParent();
2462   return true;
2463 }
2464 
2465 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2466                                        MachineRegisterInfo &MRI,
2467                                        MachineIRBuilder &B) const {
2468   Register Dst = MI.getOperand(0).getReg();
2469   LLT DstTy = MRI.getType(Dst);
2470   LLT S16 = LLT::scalar(16);
2471   LLT S32 = LLT::scalar(32);
2472   LLT S64 = LLT::scalar(64);
2473 
2474   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2475     return true;
2476 
2477   if (DstTy == S16)
2478     return legalizeFDIV16(MI, MRI, B);
2479   if (DstTy == S32)
2480     return legalizeFDIV32(MI, MRI, B);
2481   if (DstTy == S64)
2482     return legalizeFDIV64(MI, MRI, B);
2483 
2484   return false;
2485 }
2486 
2487 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2488   const LLT S32 = LLT::scalar(32);
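  // 0x4f800000 is 2**32 as a float, so this approximates 2**32 / Src.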
2489 
2490   auto Cvt0 = B.buildUITOFP(S32, Src);
2491   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2492   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2493   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2494   return B.buildFPTOUI(S32, Mul).getReg(0);
2495 }
2496 
2497 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2498                                                   Register DstReg,
2499                                                   Register Num,
2500                                                   Register Den,
2501                                                   bool IsRem) const {
2502   const LLT S1 = LLT::scalar(1);
2503   const LLT S32 = LLT::scalar(32);
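  // The quotient produced by the reciprocal below may be off by one in either
  // direction, so the selects at the end nudge it (and the remainder) back
  // into range; e.g. for Num = 10, Den = 3 the corrected results are 3 and 1.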
2504 
2505   // RCP =  URECIP(Den) = 2^32 / Den + e
2506   // e is rounding error.
2507   auto RCP = buildDivRCP(B, Den);
2508 
2509   // RCP_LO = mul(RCP, Den)
2510   auto RCP_LO = B.buildMul(S32, RCP, Den);
2511 
  // RCP_HI = mulhu(RCP, Den)
2513   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2514 
2515   // NEG_RCP_LO = -RCP_LO
2516   auto Zero = B.buildConstant(S32, 0);
2517   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2518 
2519   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2520   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2521   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2522 
2523   // Calculate the rounding error from the URECIP instruction
2524   // E = mulhu(ABS_RCP_LO, RCP)
2525   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2526 
2527   // RCP_A_E = RCP + E
2528   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2529 
2530   // RCP_S_E = RCP - E
2531   auto RCP_S_E = B.buildSub(S32, RCP, E);
2532 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2534   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2535 
  // Quotient = mulhu(Tmp0, Num)
2537   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2538 
2539   // Num_S_Remainder = Quotient * Den
2540   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2541 
2542   // Remainder = Num - Num_S_Remainder
2543   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2544 
2545   // Remainder_GE_Den = Remainder >= Den
2546   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2547 
2548   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2549   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2550                                        Num, Num_S_Remainder);
2551 
2552   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2553   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2554 
2555   // Calculate Division result:
2556 
2557   // Quotient_A_One = Quotient + 1
2558   auto One = B.buildConstant(S32, 1);
2559   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2560 
2561   // Quotient_S_One = Quotient - 1
2562   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2563 
2564   // Div = (Tmp1 ? Quotient_A_One : Quotient)
2565   auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2566 
2567   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2568   if (IsRem) {
2569     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2570 
2571     // Calculate Rem result:
2572     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2573 
2574     // Remainder_A_Den = Remainder + Den
2575     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2576 
2577     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2578     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2579 
2580     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2581     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2582   } else {
2583     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2584   }
2585 }
2586 
2587 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2588                                               MachineRegisterInfo &MRI,
2589                                               MachineIRBuilder &B) const {
2590   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2591   Register DstReg = MI.getOperand(0).getReg();
2592   Register Num = MI.getOperand(1).getReg();
2593   Register Den = MI.getOperand(2).getReg();
2594   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2595   MI.eraseFromParent();
2596   return true;
2597 }
2598 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2600 //
2601 // Return lo, hi of result
2602 //
2603 // %cvt.lo = G_UITOFP Val.lo
2604 // %cvt.hi = G_UITOFP Val.hi
2605 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2606 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2607 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2608 // %mul2 = G_FMUL %mul1, 2**(-32)
2609 // %trunc = G_INTRINSIC_TRUNC %mul2
2610 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2611 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
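// As 32-bit floats, 0x4f800000 is 2**32, 0x2f800000 is 2**(-32), 0xcf800000 is
// -(2**32), and 0x5f7ffffc is just under 2**64 (the scaling of the estimate).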
2612 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2613                                                        Register Val) {
2614   const LLT S32 = LLT::scalar(32);
2615   auto Unmerge = B.buildUnmerge(S32, Val);
2616 
2617   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2618   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2619 
2620   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2621                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2622 
2623   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2624   auto Mul1 =
2625       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2626 
2627   // 2**(-32)
2628   auto Mul2 =
2629       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2630   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2631 
2632   // -(2**32)
2633   auto Mad2 = B.buildFMAD(S32, Trunc,
2634                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2635 
2636   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2637   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2638 
2639   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2640 }
2641 
2642 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2643                                               MachineRegisterInfo &MRI,
2644                                               MachineIRBuilder &B) const {
2645   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2646   const LLT S32 = LLT::scalar(32);
2647   const LLT S64 = LLT::scalar(64);
2648   const LLT S1 = LLT::scalar(1);
2649   Register Numer = MI.getOperand(1).getReg();
2650   Register Denom = MI.getOperand(2).getReg();
2651   Register RcpLo, RcpHi;
2652 
2653   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2654 
2655   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2656 
2657   auto Zero64 = B.buildConstant(S64, 0);
2658   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2659 
2660   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2661   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2662 
2663   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2664   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2665   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2666 
2667   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2668   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2669   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2670   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2671 
2672   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2673   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2674   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2675   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2676   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2677 
2678   auto Zero32 = B.buildConstant(S32, 0);
2679   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2680   auto Add2_HiC =
2681       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2682   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2683   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2684 
2685   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2686   Register NumerLo = UnmergeNumer.getReg(0);
2687   Register NumerHi = UnmergeNumer.getReg(1);
2688 
2689   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2690   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2691   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2692   Register Mul3_Lo = UnmergeMul3.getReg(0);
2693   Register Mul3_Hi = UnmergeMul3.getReg(1);
2694   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2695   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2696   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2697   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2698 
2699   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2700   Register DenomLo = UnmergeDenom.getReg(0);
2701   Register DenomHi = UnmergeDenom.getReg(1);
2702 
2703   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2704   auto C1 = B.buildSExt(S32, CmpHi);
2705 
2706   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2707   auto C2 = B.buildSExt(S32, CmpLo);
2708 
2709   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2710   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2711 
2712   // TODO: Here and below portions of the code can be enclosed into if/endif.
2713   // Currently control flow is unconditional and we have 4 selects after
2714   // potential endif to substitute PHIs.
2715 
2716   // if C3 != 0 ...
2717   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2718   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2719   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2720   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2721 
2722   auto One64 = B.buildConstant(S64, 1);
2723   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2724 
2725   auto C4 =
2726       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2727   auto C5 =
2728       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2729   auto C6 = B.buildSelect(
2730       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2731 
2732   // if (C6 != 0)
2733   auto Add4 = B.buildAdd(S64, Add3, One64);
2734   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2735 
2736   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2737   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2738   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2739 
2740   // endif C6
2741   // endif C3
2742 
2743   if (IsDiv) {
2744     auto Sel1 = B.buildSelect(
2745         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2746     B.buildSelect(MI.getOperand(0),
2747                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2748   } else {
2749     auto Sel2 = B.buildSelect(
2750         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2751     B.buildSelect(MI.getOperand(0),
2752                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2753   }
2754 
2755   MI.eraseFromParent();
2756   return true;
2757 }
2758 
2759 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2760                                             MachineRegisterInfo &MRI,
2761                                             MachineIRBuilder &B) const {
2762   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2763   if (Ty == LLT::scalar(32))
2764     return legalizeUDIV_UREM32(MI, MRI, B);
2765   if (Ty == LLT::scalar(64))
2766     return legalizeUDIV_UREM64(MI, MRI, B);
2767   return false;
2768 }
2769 
2770 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2771                                               MachineRegisterInfo &MRI,
2772                                               MachineIRBuilder &B) const {
2773   const LLT S32 = LLT::scalar(32);
2774 
2775   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2776   Register DstReg = MI.getOperand(0).getReg();
2777   Register LHS = MI.getOperand(1).getReg();
2778   Register RHS = MI.getOperand(2).getReg();
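  // Take absolute values using the identity abs(x) = (x + (x >> 31)) ^ (x >> 31),
  // do the unsigned divide/rem, then restore the sign of the result afterwards.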
2779 
2780   auto ThirtyOne = B.buildConstant(S32, 31);
2781   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2783 
2784   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2785   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2786 
2787   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2788   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2789 
2790   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2791   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2792 
2793   if (IsRem) {
2794     auto RSign = LHSign; // Remainder sign is the same as LHS
2795     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2796     B.buildSub(DstReg, UDivRem, RSign);
2797   } else {
2798     auto DSign = B.buildXor(S32, LHSign, RHSign);
2799     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2800     B.buildSub(DstReg, UDivRem, DSign);
2801   }
2802 
2803   MI.eraseFromParent();
2804   return true;
2805 }
2806 
2807 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2808                                             MachineRegisterInfo &MRI,
2809                                             MachineIRBuilder &B) const {
2810   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2811     return legalizeSDIV_SREM32(MI, MRI, B);
2812   return false;
2813 }
2814 
2815 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2816                                                  MachineRegisterInfo &MRI,
2817                                                  MachineIRBuilder &B) const {
2818   Register Res = MI.getOperand(0).getReg();
2819   Register LHS = MI.getOperand(1).getReg();
2820   Register RHS = MI.getOperand(2).getReg();
2821 
2822   uint16_t Flags = MI.getFlags();
2823 
2824   LLT ResTy = MRI.getType(Res);
2825   LLT S32 = LLT::scalar(32);
2826   LLT S64 = LLT::scalar(64);
2827 
2828   const MachineFunction &MF = B.getMF();
2829   bool Unsafe =
2830     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2831 
2832   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2833     return false;
2834 
2835   if (!Unsafe && ResTy == S32 &&
2836       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2837     return false;
2838 
2839   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2840     // 1 / x -> RCP(x)
2841     if (CLHS->isExactlyValue(1.0)) {
2842       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2843         .addUse(RHS)
2844         .setMIFlags(Flags);
2845 
2846       MI.eraseFromParent();
2847       return true;
2848     }
2849 
2850     // -1 / x -> RCP( FNEG(x) )
2851     if (CLHS->isExactlyValue(-1.0)) {
2852       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2853       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2854         .addUse(FNeg.getReg(0))
2855         .setMIFlags(Flags);
2856 
2857       MI.eraseFromParent();
2858       return true;
2859     }
2860   }
2861 
2862   // x / y -> x * (1.0 / y)
2863   if (Unsafe) {
2864     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2865       .addUse(RHS)
2866       .setMIFlags(Flags);
2867     B.buildFMul(Res, LHS, RCP, Flags);
2868 
2869     MI.eraseFromParent();
2870     return true;
2871   }
2872 
2873   return false;
2874 }
2875 
2876 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2877                                          MachineRegisterInfo &MRI,
2878                                          MachineIRBuilder &B) const {
2879   Register Res = MI.getOperand(0).getReg();
2880   Register LHS = MI.getOperand(1).getReg();
2881   Register RHS = MI.getOperand(2).getReg();
2882 
2883   uint16_t Flags = MI.getFlags();
2884 
2885   LLT S16 = LLT::scalar(16);
2886   LLT S32 = LLT::scalar(32);
2887 
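  // There is no f16 divide instruction, so approximate the quotient in f32:
  // extend both operands, multiply by the f32 reciprocal, truncate back to
  // f16, and let div_fixup handle the special cases.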
2888   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2889   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2890 
2891   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2892     .addUse(RHSExt.getReg(0))
2893     .setMIFlags(Flags);
2894 
2895   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2896   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2897 
2898   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2899     .addUse(RDst.getReg(0))
2900     .addUse(RHS)
2901     .addUse(LHS)
2902     .setMIFlags(Flags);
2903 
2904   MI.eraseFromParent();
2905   return true;
2906 }
2907 
2908 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2909 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2910 static void toggleSPDenormMode(bool Enable,
2911                                MachineIRBuilder &B,
2912                                const GCNSubtarget &ST,
2913                                AMDGPU::SIModeRegisterDefaults Mode) {
2914   // Set SP denorm mode to this value.
2915   unsigned SPDenormMode =
2916     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2917 
2918   if (ST.hasDenormModeInst()) {
2919     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2920     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2921 
2922     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2923     B.buildInstr(AMDGPU::S_DENORM_MODE)
2924       .addImm(NewDenormModeValue);
2925 
2926   } else {
2927     // Select FP32 bit field in mode register.
2928     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2929                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2930                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
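    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm controls occupy a
    // 2-bit field at bit offset 4 of the MODE register (the FP64/FP16 denorm
    // controls sit in the adjacent field at offset 6).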
2931 
2932     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2933       .addImm(SPDenormMode)
2934       .addImm(SPDenormModeBitField);
2935   }
2936 }
2937 
2938 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2939                                          MachineRegisterInfo &MRI,
2940                                          MachineIRBuilder &B) const {
2941   Register Res = MI.getOperand(0).getReg();
2942   Register LHS = MI.getOperand(1).getReg();
2943   Register RHS = MI.getOperand(2).getReg();
2944   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2945   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2946 
2947   uint16_t Flags = MI.getFlags();
2948 
2949   LLT S32 = LLT::scalar(32);
2950   LLT S1 = LLT::scalar(1);
2951 
2952   auto One = B.buildFConstant(S32, 1.0f);
2953 
2954   auto DenominatorScaled =
2955     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2956       .addUse(LHS)
2957       .addUse(RHS)
2958       .addImm(0)
2959       .setMIFlags(Flags);
2960   auto NumeratorScaled =
2961     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2962       .addUse(LHS)
2963       .addUse(RHS)
2964       .addImm(1)
2965       .setMIFlags(Flags);
2966 
2967   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2968     .addUse(DenominatorScaled.getReg(0))
2969     .setMIFlags(Flags);
2970   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2971 
2972   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2973   // aren't modeled as reading it.
2974   if (!Mode.allFP32Denormals())
2975     toggleSPDenormMode(true, B, ST, Mode);
2976 
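  // Newton-Raphson style refinement on the scaled operands: Fma0 computes the
  // error of the reciprocal estimate (1 - d * r), Fma1 folds that error back in
  // to refine the reciprocal, Mul forms the initial quotient, and Fma2..Fma4
  // compute and apply the quotient residual, leaving the final correction to
  // div_fmas below.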
2977   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2978   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2979   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2980   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2981   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2982   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2983 
2984   if (!Mode.allFP32Denormals())
2985     toggleSPDenormMode(false, B, ST, Mode);
2986 
2987   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2988     .addUse(Fma4.getReg(0))
2989     .addUse(Fma1.getReg(0))
2990     .addUse(Fma3.getReg(0))
2991     .addUse(NumeratorScaled.getReg(1))
2992     .setMIFlags(Flags);
2993 
2994   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2995     .addUse(Fmas.getReg(0))
2996     .addUse(RHS)
2997     .addUse(LHS)
2998     .setMIFlags(Flags);
2999 
3000   MI.eraseFromParent();
3001   return true;
3002 }
3003 
3004 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3005                                          MachineRegisterInfo &MRI,
3006                                          MachineIRBuilder &B) const {
3007   Register Res = MI.getOperand(0).getReg();
3008   Register LHS = MI.getOperand(1).getReg();
3009   Register RHS = MI.getOperand(2).getReg();
3010 
3011   uint16_t Flags = MI.getFlags();
3012 
3013   LLT S64 = LLT::scalar(64);
3014   LLT S1 = LLT::scalar(1);
3015 
3016   auto One = B.buildFConstant(S64, 1.0);
3017 
3018   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3019     .addUse(LHS)
3020     .addUse(RHS)
3021     .addImm(0)
3022     .setMIFlags(Flags);
3023 
3024   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3025 
3026   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3027     .addUse(DivScale0.getReg(0))
3028     .setMIFlags(Flags);
3029 
3030   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3031   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3032   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3033 
3034   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3035     .addUse(LHS)
3036     .addUse(RHS)
3037     .addImm(1)
3038     .setMIFlags(Flags);
3039 
3040   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3041   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3042   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3043 
3044   Register Scale;
3045   if (!ST.hasUsableDivScaleConditionOutput()) {
3046     // Workaround a hardware bug on SI where the condition output from div_scale
3047     // is not usable.
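    // Recover it manually instead: compare the high dwords of the original
    // operands with the high dwords of the div_scale results, and xor the two
    // comparisons to form the condition bit that div_fmas expects.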
3048 
3049     LLT S32 = LLT::scalar(32);
3050 
3051     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3052     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3053     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3054     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3055 
3056     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3057                               Scale1Unmerge.getReg(1));
3058     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3059                               Scale0Unmerge.getReg(1));
3060     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3061   } else {
3062     Scale = DivScale1.getReg(1);
3063   }
3064 
3065   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3066     .addUse(Fma4.getReg(0))
3067     .addUse(Fma3.getReg(0))
3068     .addUse(Mul.getReg(0))
3069     .addUse(Scale)
3070     .setMIFlags(Flags);
3071 
3072   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3073     .addUse(Fmas.getReg(0))
3074     .addUse(RHS)
3075     .addUse(LHS)
3076     .setMIFlags(Flags);
3077 
3078   MI.eraseFromParent();
3079   return true;
3080 }
3081 
3082 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3083                                                  MachineRegisterInfo &MRI,
3084                                                  MachineIRBuilder &B) const {
3085   Register Res = MI.getOperand(0).getReg();
3086   Register LHS = MI.getOperand(2).getReg();
3087   Register RHS = MI.getOperand(3).getReg();
3088   uint16_t Flags = MI.getFlags();
3089 
3090   LLT S32 = LLT::scalar(32);
3091   LLT S1 = LLT::scalar(1);
3092 
3093   auto Abs = B.buildFAbs(S32, RHS, Flags);
3094   const APFloat C0Val(1.0f);
3095 
3096   auto C0 = B.buildConstant(S32, 0x6f800000);
3097   auto C1 = B.buildConstant(S32, 0x2f800000);
3098   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
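  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. When |RHS|
  // exceeds 2^96, the denominator is scaled down by 2^-32 before taking the
  // reciprocal so the rcp result stays in the normal range; multiplying the
  // quotient by the same scale factor at the end cancels the adjustment.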
3099 
3100   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3101   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3102 
3103   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3104 
3105   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3106     .addUse(Mul0.getReg(0))
3107     .setMIFlags(Flags);
3108 
3109   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3110 
3111   B.buildFMul(Res, Sel, Mul1, Flags);
3112 
3113   MI.eraseFromParent();
3114   return true;
3115 }
3116 
3117 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3118                                                  MachineRegisterInfo &MRI,
3119                                                  MachineIRBuilder &B) const {
3120   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3121   if (!MFI->isEntryFunction()) {
3122     return legalizePreloadedArgIntrin(MI, MRI, B,
3123                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3124   }
3125 
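  // For kernels, the implicit arguments live in the kernarg segment directly
  // after the explicit kernel arguments, so the implicit argument pointer is
  // just the kernarg segment pointer advanced by that offset.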
3126   uint64_t Offset =
3127     ST.getTargetLowering()->getImplicitParameterOffset(
3128       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3129   Register DstReg = MI.getOperand(0).getReg();
3130   LLT DstTy = MRI.getType(DstReg);
3131   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3132 
3133   const ArgDescriptor *Arg;
3134   const TargetRegisterClass *RC;
3135   std::tie(Arg, RC)
3136     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3137   if (!Arg)
3138     return false;
3139 
3140   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3141   if (!loadInputValue(KernargPtrReg, B, Arg))
3142     return false;
3143 
3144   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3145   MI.eraseFromParent();
3146   return true;
3147 }
3148 
3149 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3150                                               MachineRegisterInfo &MRI,
3151                                               MachineIRBuilder &B,
3152                                               unsigned AddrSpace) const {
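  // A flat pointer falls in the given segment when its high 32 bits match that
  // segment's aperture base, so compare the extracted high dword against the
  // aperture register.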
3153   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3154   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3155   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3156   MI.eraseFromParent();
3157   return true;
3158 }
3159 
3160 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3161 // offset (the offset that is included in bounds checking and swizzling, to be
3162 // split between the instruction's voffset and immoffset fields) and soffset
3163 // (the offset that is excluded from bounds checking and swizzling, to go in
3164 // the instruction's soffset field).  This function takes the first kind of
3165 // offset and figures out how to split it between voffset and immoffset.
3166 std::tuple<Register, unsigned, unsigned>
3167 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3168                                         Register OrigOffset) const {
3169   const unsigned MaxImm = 4095;
3170   Register BaseReg;
3171   unsigned TotalConstOffset;
3172   MachineInstr *OffsetDef;
3173   const LLT S32 = LLT::scalar(32);
3174 
3175   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3176     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3177 
3178   unsigned ImmOffset = TotalConstOffset;
3179 
3180   // If the immediate value is too big for the immoffset field, keep only the
3181   // low 12 bits in the immoffset field, so that the value that is copied/added
3182   // for the voffset field is a multiple of 4096, and it stands more chance
3183   // of being CSEd with the copy/add for another similar load/store.
3184   // However, do not do that rounding down to a multiple of 4096 if that is a
3185   // negative number, as it appears to be illegal to have a negative offset
3186   // in the vgpr, even if adding the immediate offset makes it positive.
3187   unsigned Overflow = ImmOffset & ~MaxImm;
3188   ImmOffset -= Overflow;
3189   if ((int32_t)Overflow < 0) {
3190     Overflow += ImmOffset;
3191     ImmOffset = 0;
3192   }
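  // For example (illustrative), a constant offset of 8196 splits into
  // Overflow = 8192 (a multiple of 4096, added into the voffset) and
  // ImmOffset = 4 (placed in the immoffset field).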
3193 
3194   if (Overflow != 0) {
3195     if (!BaseReg) {
3196       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3197     } else {
3198       auto OverflowVal = B.buildConstant(S32, Overflow);
3199       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3200     }
3201   }
3202 
3203   if (!BaseReg)
3204     BaseReg = B.buildConstant(S32, 0).getReg(0);
3205 
3206   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3207 }
3208 
3209 /// Handle register layout difference for f16 images for some subtargets.
3210 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3211                                              MachineRegisterInfo &MRI,
3212                                              Register Reg) const {
3213   if (!ST.hasUnpackedD16VMem())
3214     return Reg;
3215 
3216   const LLT S16 = LLT::scalar(16);
3217   const LLT S32 = LLT::scalar(32);
3218   LLT StoreVT = MRI.getType(Reg);
3219   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
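  // On subtargets with the unpacked D16 layout, each 16-bit component occupies
  // the low half of its own 32-bit register, so split the vector and widen
  // every element to s32 before rebuilding it.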
3220 
3221   auto Unmerge = B.buildUnmerge(S16, Reg);
3222 
3223   SmallVector<Register, 4> WideRegs;
3224   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3225     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3226 
3227   int NumElts = StoreVT.getNumElements();
3228 
3229   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3230 }
3231 
3232 Register AMDGPULegalizerInfo::fixStoreSourceType(
3233   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3234   MachineRegisterInfo *MRI = B.getMRI();
3235   LLT Ty = MRI->getType(VData);
3236 
3237   const LLT S16 = LLT::scalar(16);
3238 
3239   // Fixup illegal register types for i8 stores.
3240   if (Ty == LLT::scalar(8) || Ty == S16) {
3241     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3242     return AnyExt;
3243   }
3244 
3245   if (Ty.isVector()) {
3246     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3247       if (IsFormat)
3248         return handleD16VData(B, *MRI, VData);
3249     }
3250   }
3251 
3252   return VData;
3253 }
3254 
3255 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3256                                               MachineRegisterInfo &MRI,
3257                                               MachineIRBuilder &B,
3258                                               bool IsTyped,
3259                                               bool IsFormat) const {
3260   Register VData = MI.getOperand(1).getReg();
3261   LLT Ty = MRI.getType(VData);
3262   LLT EltTy = Ty.getScalarType();
3263   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3264   const LLT S32 = LLT::scalar(32);
3265 
3266   VData = fixStoreSourceType(B, VData, IsFormat);
3267   Register RSrc = MI.getOperand(2).getReg();
3268 
3269   MachineMemOperand *MMO = *MI.memoperands_begin();
3270   const int MemSize = MMO->getSize();
3271 
3272   unsigned ImmOffset;
3273   unsigned TotalOffset;
3274 
3275   // The typed intrinsics add an immediate after the registers.
3276   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3277 
3278   // The struct intrinsic variants add one additional operand over raw.
3279   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
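  // The operand layout consumed below is: intrinsic ID, vdata, rsrc,
  // [vindex for the struct forms], voffset, soffset, [format for the typed
  // forms], aux/cachepolicy.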
3280   Register VIndex;
3281   int OpOffset = 0;
3282   if (HasVIndex) {
3283     VIndex = MI.getOperand(3).getReg();
3284     OpOffset = 1;
3285   }
3286 
3287   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3288   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3289 
3290   unsigned Format = 0;
3291   if (IsTyped) {
3292     Format = MI.getOperand(5 + OpOffset).getImm();
3293     ++OpOffset;
3294   }
3295 
3296   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3297 
3298   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3299   if (TotalOffset != 0)
3300     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3301 
3302   unsigned Opc;
3303   if (IsTyped) {
3304     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3305                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3306   } else if (IsFormat) {
3307     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3308                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3309   } else {
3310     switch (MemSize) {
3311     case 1:
3312       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3313       break;
3314     case 2:
3315       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3316       break;
3317     default:
3318       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3319       break;
3320     }
3321   }
3322 
3323   if (!VIndex)
3324     VIndex = B.buildConstant(S32, 0).getReg(0);
3325 
3326   auto MIB = B.buildInstr(Opc)
3327     .addUse(VData)              // vdata
3328     .addUse(RSrc)               // rsrc
3329     .addUse(VIndex)             // vindex
3330     .addUse(VOffset)            // voffset
3331     .addUse(SOffset)            // soffset
3332     .addImm(ImmOffset);         // offset(imm)
3333 
3334   if (IsTyped)
3335     MIB.addImm(Format);
3336 
3337   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3338      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3339      .addMemOperand(MMO);
3340 
3341   MI.eraseFromParent();
3342   return true;
3343 }
3344 
3345 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3346                                              MachineRegisterInfo &MRI,
3347                                              MachineIRBuilder &B,
3348                                              bool IsFormat,
3349                                              bool IsTyped) const {
3350   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3351   MachineMemOperand *MMO = *MI.memoperands_begin();
3352   const int MemSize = MMO->getSize();
3353   const LLT S32 = LLT::scalar(32);
3354 
3355   Register Dst = MI.getOperand(0).getReg();
3356   Register RSrc = MI.getOperand(2).getReg();
3357 
3358   // The typed intrinsics add an immediate after the registers.
3359   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3360 
3361   // The struct intrinsic variants add one additional operand over raw.
3362   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3363   Register VIndex;
3364   int OpOffset = 0;
3365   if (HasVIndex) {
3366     VIndex = MI.getOperand(3).getReg();
3367     OpOffset = 1;
3368   }
3369 
3370   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3371   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3372 
3373   unsigned Format = 0;
3374   if (IsTyped) {
3375     Format = MI.getOperand(5 + OpOffset).getImm();
3376     ++OpOffset;
3377   }
3378 
3379   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3380   unsigned ImmOffset;
3381   unsigned TotalOffset;
3382 
3383   LLT Ty = MRI.getType(Dst);
3384   LLT EltTy = Ty.getScalarType();
3385   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3386   const bool Unpacked = ST.hasUnpackedD16VMem();
3387 
3388   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3389   if (TotalOffset != 0)
3390     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3391 
3392   unsigned Opc;
3393 
3394   if (IsTyped) {
3395     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3396                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3397   } else if (IsFormat) {
3398     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3399                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3400   } else {
3401     switch (MemSize) {
3402     case 1:
3403       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3404       break;
3405     case 2:
3406       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3407       break;
3408     default:
3409       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3410       break;
3411     }
3412   }
3413 
3414   Register LoadDstReg;
3415 
3416   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3417   LLT UnpackedTy = Ty.changeElementSize(32);
3418 
3419   if (IsExtLoad)
3420     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3421   else if (Unpacked && IsD16 && Ty.isVector())
3422     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3423   else
3424     LoadDstReg = Dst;
3425 
3426   if (!VIndex)
3427     VIndex = B.buildConstant(S32, 0).getReg(0);
3428 
3429   auto MIB = B.buildInstr(Opc)
3430     .addDef(LoadDstReg)         // vdata
3431     .addUse(RSrc)               // rsrc
3432     .addUse(VIndex)             // vindex
3433     .addUse(VOffset)            // voffset
3434     .addUse(SOffset)            // soffset
3435     .addImm(ImmOffset);         // offset(imm)
3436 
3437   if (IsTyped)
3438     MIB.addImm(Format);
3439 
3440   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3441      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3442      .addMemOperand(MMO);
3443 
3444   if (LoadDstReg != Dst) {
3445     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3446 
3447     // The result was widened for the extending load; truncate back down.
3448     if (IsExtLoad)
3449       B.buildTrunc(Dst, LoadDstReg);
3450     else {
3451       // Repack to original 16-bit vector result
3452       // FIXME: G_TRUNC should work, but legalization currently fails
3453       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3454       SmallVector<Register, 4> Repack;
3455       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3456         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3457       B.buildMerge(Dst, Repack);
3458     }
3459   }
3460 
3461   MI.eraseFromParent();
3462   return true;
3463 }
3464 
3465 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3466                                                MachineIRBuilder &B,
3467                                                bool IsInc) const {
3468   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3469                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3470   B.buildInstr(Opc)
3471     .addDef(MI.getOperand(0).getReg())
3472     .addUse(MI.getOperand(2).getReg())
3473     .addUse(MI.getOperand(3).getReg())
3474     .cloneMemRefs(MI);
3475   MI.eraseFromParent();
3476   return true;
3477 }
3478 
3479 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3480   switch (IntrID) {
3481   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3482   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3483     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3484   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3486     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3487   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3488   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3489     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3490   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3491   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3492     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3493   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3494   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3495     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3496   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3497   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3498     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3499   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3500   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3501     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3502   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3503   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3504     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3505   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3506   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3507     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3508   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3509   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3510     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3511   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3512   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3513     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3514   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3515   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3516     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3517   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3518   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3519     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3520   default:
3521     llvm_unreachable("unhandled atomic opcode");
3522   }
3523 }
3524 
3525 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3526                                                MachineIRBuilder &B,
3527                                                Intrinsic::ID IID) const {
3528   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3529                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3530 
3531   Register Dst = MI.getOperand(0).getReg();
3532   Register VData = MI.getOperand(2).getReg();
3533 
3534   Register CmpVal;
3535   int OpOffset = 0;
3536 
3537   if (IsCmpSwap) {
3538     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3539     ++OpOffset;
3540   }
3541 
3542   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3543   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3544 
3545   // The struct intrinsic variants add one additional operand over raw.
3546   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3547   Register VIndex;
3548   if (HasVIndex) {
3549     VIndex = MI.getOperand(4 + OpOffset).getReg();
3550     ++OpOffset;
3551   }
3552 
3553   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3554   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3555   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3556 
3557   MachineMemOperand *MMO = *MI.memoperands_begin();
3558 
3559   unsigned ImmOffset;
3560   unsigned TotalOffset;
3561   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3562   if (TotalOffset != 0)
3563     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3564 
3565   if (!VIndex)
3566     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3567 
3568   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3569     .addDef(Dst)
3570     .addUse(VData); // vdata
3571 
3572   if (IsCmpSwap)
3573     MIB.addReg(CmpVal);
3574 
3575   MIB.addUse(RSrc)               // rsrc
3576      .addUse(VIndex)             // vindex
3577      .addUse(VOffset)            // voffset
3578      .addUse(SOffset)            // soffset
3579      .addImm(ImmOffset)          // offset(imm)
3580      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3581      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3582      .addMemOperand(MMO);
3583 
3584   MI.eraseFromParent();
3585   return true;
3586 }
3587 
3588 /// Pack the s16 typed address registers of \p MI, starting at \p AddrIdx, into
3589 /// dword sized <2 x s16> registers appended to \p PackedAddrs.
3590 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3591                                         SmallVectorImpl<Register> &PackedAddrs,
3592                                         int AddrIdx, int DimIdx, int EndIdx,
3593                                         int NumGradients) {
3594   const LLT S16 = LLT::scalar(16);
3595   const LLT V2S16 = LLT::vector(2, 16);
3596 
3597   for (int I = AddrIdx; I < EndIdx; ++I) {
3598     MachineOperand &SrcOp = MI.getOperand(I);
3599     if (!SrcOp.isReg())
3600       continue; // _L to _LZ may have eliminated this.
3601 
3602     Register AddrReg = SrcOp.getReg();
3603 
3604     if (I < DimIdx) {
3605       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3606       PackedAddrs.push_back(AddrReg);
3607     } else {
3608       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3609       // derivatives dx/dh and dx/dv are packed with undef.
3610       if (((I + 1) >= EndIdx) ||
3611           ((NumGradients / 2) % 2 == 1 &&
3612            (I == DimIdx + (NumGradients / 2) - 1 ||
3613             I == DimIdx + NumGradients - 1)) ||
3614           // Check for _L to _LZ optimization
3615           !MI.getOperand(I + 1).isReg()) {
3616         PackedAddrs.push_back(
3617             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3618                 .getReg(0));
3619       } else {
3620         PackedAddrs.push_back(
3621             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3622                 .getReg(0));
3623         ++I;
3624       }
3625     }
3626   }
3627 }
3628 
3629 /// Convert from separate vaddr components to a single vector address register,
3630 /// and replace the remaining operands with $noreg.
3631 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3632                                      int DimIdx, int NumVAddrs) {
3633   const LLT S32 = LLT::scalar(32);
3634 
3635   SmallVector<Register, 8> AddrRegs;
3636   for (int I = 0; I != NumVAddrs; ++I) {
3637     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3638     if (SrcOp.isReg()) {
3639       AddrRegs.push_back(SrcOp.getReg());
3640       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3641     }
3642   }
3643 
3644   int NumAddrRegs = AddrRegs.size();
3645   if (NumAddrRegs != 1) {
3646     // Round up to 8 elements for v5-v7
3647     // FIXME: Missing intermediate sized register classes and instructions.
3648     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3649       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3650       auto Undef = B.buildUndef(S32);
3651       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3652       NumAddrRegs = RoundedNumRegs;
3653     }
3654 
3655     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3656     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3657   }
3658 
3659   for (int I = 1; I != NumVAddrs; ++I) {
3660     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3661     if (SrcOp.isReg())
3662       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3663   }
3664 }
3665 
3666 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3667 ///
3668 /// Depending on the subtarget, load/store with 16-bit element data need to be
3669 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3670 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3671 /// registers.
3672 ///
3673 /// We don't want to directly select image instructions just yet, but we also
3674 /// want to expose all register repacking to the legalizer/combiners. We also
3675 /// don't want a selected instruction entering RegBankSelect. In order to avoid
3676 /// defining a multitude of intermediate image instructions, directly hack on
3677 /// the intrinsic's arguments. In cases like a16 addresses, this requires
3678 /// padding the now unnecessary arguments with $noreg.
3679 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3680     MachineInstr &MI, MachineIRBuilder &B,
3681     GISelChangeObserver &Observer,
3682     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3683 
3684   const int NumDefs = MI.getNumExplicitDefs();
3685   bool IsTFE = NumDefs == 2;
3686   // We are only processing the operands of d16 image operations on subtargets
3687   // that use the unpacked register layout, or need to repack the TFE result.
3688 
3689   // TODO: Do we need to guard against already legalized intrinsics?
3690   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3691     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3692 
3693   MachineRegisterInfo *MRI = B.getMRI();
3694   const LLT S32 = LLT::scalar(32);
3695   const LLT S16 = LLT::scalar(16);
3696   const LLT V2S16 = LLT::vector(2, 16);
3697 
3698   // Index of first address argument
3699   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3700 
3701   int NumVAddrs, NumGradients;
3702   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3703   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3704     getDMaskIdx(BaseOpcode, NumDefs);
3705   unsigned DMask = 0;
3706 
3707   // Check for 16-bit addresses; if present, they are packed into dwords below.
3708   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3709   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3710   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3711   const bool IsG16 = GradTy == S16;
3712   const bool IsA16 = AddrTy == S16;
3713 
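  // For non-atomic operations the dmask immediate selects which of the (up to
  // four) components are actually read or written; its population count gives
  // the number of data registers, and gather4 always produces four.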
3714   int DMaskLanes = 0;
3715   if (!BaseOpcode->Atomic) {
3716     DMask = MI.getOperand(DMaskIdx).getImm();
3717     if (BaseOpcode->Gather4) {
3718       DMaskLanes = 4;
3719     } else if (DMask != 0) {
3720       DMaskLanes = countPopulation(DMask);
3721     } else if (!IsTFE && !BaseOpcode->Store) {
3722       // If dmask is 0, this is a no-op load. This can be eliminated.
3723       B.buildUndef(MI.getOperand(0));
3724       MI.eraseFromParent();
3725       return true;
3726     }
3727   }
3728 
3729   Observer.changingInstr(MI);
3730   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3731 
3732   unsigned NewOpcode = NumDefs == 0 ?
3733     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3734 
3735   // Track that we legalized this
3736   MI.setDesc(B.getTII().get(NewOpcode));
3737 
3738   // Expecting to get an error flag since TFE is on and dmask is 0. Force
3739   // dmask to be at least 1, otherwise the instruction will fail.
3740   if (IsTFE && DMask == 0) {
3741     DMask = 0x1;
3742     DMaskLanes = 1;
3743     MI.getOperand(DMaskIdx).setImm(DMask);
3744   }
3745 
3746   if (BaseOpcode->Atomic) {
3747     Register VData0 = MI.getOperand(2).getReg();
3748     LLT Ty = MRI->getType(VData0);
3749 
3750     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3751     if (Ty.isVector())
3752       return false;
3753 
3754     if (BaseOpcode->AtomicX2) {
3755       Register VData1 = MI.getOperand(3).getReg();
3756       // The two values are packed in one register.
3757       LLT PackedTy = LLT::vector(2, Ty);
3758       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3759       MI.getOperand(2).setReg(Concat.getReg(0));
3760       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3761     }
3762   }
3763 
3764   int CorrectedNumVAddrs = NumVAddrs;
3765 
3766   // Optimize _L to _LZ when _L is zero
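  // e.g. an image sample _l whose lod operand is a constant zero can use the
  // _lz variant instead, which carries no lod operand at all.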
3767   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3768         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3769     const ConstantFP *ConstantLod;
3770     const int LodIdx = AddrIdx + NumVAddrs - 1;
3771 
3772     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3773       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3774         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3775         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3776           LZMappingInfo->LZ, ImageDimIntr->Dim);
3777 
3778         // The starting indexes should remain in the same place.
3779         --NumVAddrs;
3780         --CorrectedNumVAddrs;
3781 
3782         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3783           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3784         MI.RemoveOperand(LodIdx);
3785       }
3786     }
3787   }
3788 
3789   // Optimize _mip away when 'lod' is zero
3790   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3791     int64_t ConstantLod;
3792     const int LodIdx = AddrIdx + NumVAddrs - 1;
3793 
3794     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3795       if (ConstantLod == 0) {
3796         // TODO: Change the intrinsic opcode and remove the operand instead of
3797         // replacing it with 0, as is done for the _L to _LZ handling above.
3798         MI.getOperand(LodIdx).ChangeToImmediate(0);
3799         --CorrectedNumVAddrs;
3800       }
3801     }
3802   }
3803 
3804   // Rewrite the addressing register layout before doing anything else.
3805   if (IsA16 || IsG16) {
3806     if (IsA16) {
3807       // Target must support the feature and gradients need to be 16 bit too
3808       if (!ST.hasA16() || !IsG16)
3809         return false;
3810     } else if (!ST.hasG16())
3811       return false;
3812 
3813     if (NumVAddrs > 1) {
3814       SmallVector<Register, 4> PackedRegs;
3815       // Don't compress addresses for G16
3816       const int PackEndIdx =
3817           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3818       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3819                                   PackEndIdx, NumGradients);
3820 
3821       if (!IsA16) {
3822         // Add uncompressed address
3823         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3824           Register AddrReg = MI.getOperand(I).getReg();
3825           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3826           PackedRegs.push_back(AddrReg);
3827         }
3828       }
3829 
3830       // See also below in the non-a16 branch
3831       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3832 
3833       if (!UseNSA && PackedRegs.size() > 1) {
3834         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3835         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3836         PackedRegs[0] = Concat.getReg(0);
3837         PackedRegs.resize(1);
3838       }
3839 
3840       const int NumPacked = PackedRegs.size();
3841       for (int I = 0; I != NumVAddrs; ++I) {
3842         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3843         if (!SrcOp.isReg()) {
3844           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3845           continue;
3846         }
3847 
3848         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3849 
3850         if (I < NumPacked)
3851           SrcOp.setReg(PackedRegs[I]);
3852         else
3853           SrcOp.setReg(AMDGPU::NoRegister);
3854       }
3855     }
3856   } else {
3857     // If the register allocator cannot place the address registers contiguously
3858     // without introducing moves, then using the non-sequential address encoding
3859     // is always preferable, since it saves VALU instructions and is usually a
3860     // wash in terms of code size or even better.
3861     //
3862     // However, we currently have no way of hinting to the register allocator
3863     // that MIMG addresses should be placed contiguously when it is possible to
3864     // do so, so force non-NSA for the common 2-address case as a heuristic.
3865     //
3866     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3867     // allocation when possible.
3868     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3869 
3870     if (!UseNSA && NumVAddrs > 1)
3871       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3872   }
3873 
3874   int Flags = 0;
3875   if (IsA16)
3876     Flags |= 1;
3877   if (IsG16)
3878     Flags |= 2;
3879   MI.addOperand(MachineOperand::CreateImm(Flags));
3880 
3881   if (BaseOpcode->Store) { // No TFE for stores?
3882     // TODO: Handle dmask trim
3883     Register VData = MI.getOperand(1).getReg();
3884     LLT Ty = MRI->getType(VData);
3885     if (!Ty.isVector() || Ty.getElementType() != S16)
3886       return true;
3887 
3888     Register RepackedReg = handleD16VData(B, *MRI, VData);
3889     if (RepackedReg != VData) {
3890       MI.getOperand(1).setReg(RepackedReg);
3891     }
3892 
3893     return true;
3894   }
3895 
3896   Register DstReg = MI.getOperand(0).getReg();
3897   LLT Ty = MRI->getType(DstReg);
3898   const LLT EltTy = Ty.getScalarType();
3899   const bool IsD16 = Ty.getScalarType() == S16;
3900   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3901 
3902   // Confirm that the return type is large enough for the dmask specified
3903   if (NumElts < DMaskLanes)
3904     return false;
3905 
3906   if (NumElts > 4 || DMaskLanes > 4)
3907     return false;
3908 
3909   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3910   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3911 
3912   // The raw dword aligned data component of the load. The only legal cases
3913   // where this matters should be when using the packed D16 format, for
3914   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3915   LLT RoundedTy;
3916 
3917   // S32 vector to cover all data, plus the TFE result element.
3918   LLT TFETy;
3919 
3920   // Register type to use for each loaded component. Will be S32 or V2S16.
3921   LLT RegTy;
3922 
3923   if (IsD16 && ST.hasUnpackedD16VMem()) {
3924     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3925     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3926     RegTy = S32;
3927   } else {
3928     unsigned EltSize = EltTy.getSizeInBits();
3929     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3930     unsigned RoundedSize = 32 * RoundedElts;
3931     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3932     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3933     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3934   }
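  // As an illustrative example, a D16 load with DMaskLanes == 3 on a packed
  // subtarget gives AdjustedTy = <3 x s16>, RoundedTy = <4 x s16> (two dwords),
  // TFETy = <3 x s32>, and RegTy = <2 x s16> when TFE is off.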
3935 
3936   // The return type does not need adjustment.
3937   // TODO: Should we change s16 case to s32 or <2 x s16>?
3938   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3939     return true;
3940 
3941   Register Dst1Reg;
3942 
3943   // Insert after the instruction.
3944   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3945 
3946   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3947   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3948   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3949   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3950 
3951   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3952 
3953   MI.getOperand(0).setReg(NewResultReg);
3954 
3955   // In the IR, TFE is supposed to be used with a 2 element struct return
3956   // type. The instruction really returns these two values in one contiguous
3957   // register, with one additional dword beyond the loaded data. Rewrite the
3958   // return type to use a single register result.
3959 
3960   if (IsTFE) {
3961     Dst1Reg = MI.getOperand(1).getReg();
3962     if (MRI->getType(Dst1Reg) != S32)
3963       return false;
3964 
3965     // TODO: Make sure the TFE operand bit is set.
3966     MI.RemoveOperand(1);
3967 
3968     // Handle the easy case that requires no repack instructions.
3969     if (Ty == S32) {
3970       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3971       return true;
3972     }
3973   }
3974 
3975   // Now figure out how to copy the new result register back into the old
3976   // result.
3977   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3978 
3979   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3980 
3981   if (ResultNumRegs == 1) {
3982     assert(!IsTFE);
3983     ResultRegs[0] = NewResultReg;
3984   } else {
3985     // We have to repack into a new vector of some kind.
3986     for (int I = 0; I != NumDataRegs; ++I)
3987       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3988     B.buildUnmerge(ResultRegs, NewResultReg);
3989 
3990     // Drop the final TFE element to get the data part. The TFE result is
3991     // directly written to the right place already.
3992     if (IsTFE)
3993       ResultRegs.resize(NumDataRegs);
3994   }
3995 
3996   // For an s16 scalar result, we form an s32 result with a truncate regardless
3997   // of packed vs. unpacked.
3998   if (IsD16 && !Ty.isVector()) {
3999     B.buildTrunc(DstReg, ResultRegs[0]);
4000     return true;
4001   }
4002 
4003   // Avoid a build/concat_vector of 1 entry.
4004   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4005     B.buildBitcast(DstReg, ResultRegs[0]);
4006     return true;
4007   }
4008 
4009   assert(Ty.isVector());
4010 
4011   if (IsD16) {
4012     // For packed D16 results with TFE enabled, all the data components are
4013     // S32. Cast back to the expected type.
4014     //
4015     // TODO: We don't really need to load into s32 elements. We would only need one
4016     // cast for the TFE result if a multiple of v2s16 was used.
4017     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4018       for (Register &Reg : ResultRegs)
4019         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4020     } else if (ST.hasUnpackedD16VMem()) {
4021       for (Register &Reg : ResultRegs)
4022         Reg = B.buildTrunc(S16, Reg).getReg(0);
4023     }
4024   }
4025 
4026   auto padWithUndef = [&](LLT Ty, int NumElts) {
4027     if (NumElts == 0)
4028       return;
4029     Register Undef = B.buildUndef(Ty).getReg(0);
4030     for (int I = 0; I != NumElts; ++I)
4031       ResultRegs.push_back(Undef);
4032   };
4033 
4034   // Pad out any elements eliminated due to the dmask.
4035   LLT ResTy = MRI->getType(ResultRegs[0]);
4036   if (!ResTy.isVector()) {
4037     padWithUndef(ResTy, NumElts - ResultRegs.size());
4038     B.buildBuildVector(DstReg, ResultRegs);
4039     return true;
4040   }
4041 
4042   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4043   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4044 
4045   // Deal with the one annoying legal case.
4046   const LLT V3S16 = LLT::vector(3, 16);
4047   if (Ty == V3S16) {
4048     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4049     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4050     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4051     return true;
4052   }
4053 
4054   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4055   B.buildConcatVectors(DstReg, ResultRegs);
4056   return true;
4057 }
4058 
4059 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4060   MachineInstr &MI, MachineIRBuilder &B,
4061   GISelChangeObserver &Observer) const {
4062   Register Dst = MI.getOperand(0).getReg();
4063   LLT Ty = B.getMRI()->getType(Dst);
4064   unsigned Size = Ty.getSizeInBits();
4065   MachineFunction &MF = B.getMF();
4066 
4067   Observer.changingInstr(MI);
4068 
4069   // FIXME: We don't really need this intermediate instruction. The intrinsic
4070   // should be fixed to have a memory operand. Since it's readnone, we're not
4071   // allowed to add one.
4072   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4073   MI.RemoveOperand(1); // Remove intrinsic ID
4074 
4075   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4076   // TODO: Should this use datalayout alignment?
4077   const unsigned MemSize = (Size + 7) / 8;
4078   const Align MemAlign(4);
4079   MachineMemOperand *MMO = MF.getMachineMemOperand(
4080       MachinePointerInfo(),
4081       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4082           MachineMemOperand::MOInvariant,
4083       MemSize, MemAlign);
4084   MI.addMemOperand(MF, MMO);
4085 
4086   // There are no 96-bit result scalar loads, but widening to 128-bit should
4087   // always be legal. We may need to restore this to a 96-bit result if it turns
4088   // out this needs to be converted to a vector load during RegBankSelect.
4089   if (!isPowerOf2_32(Size)) {
4090     LegalizerHelper Helper(MF, *this, Observer, B);
4091 
4092     if (Ty.isVector())
4093       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4094     else
4095       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4096   }
4097 
4098   Observer.changedInstr(MI);
4099   return true;
4100 }
4101 
4102 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4103                                                 MachineRegisterInfo &MRI,
4104                                                 MachineIRBuilder &B) const {
4105   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4106   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4107       !ST.isTrapHandlerEnabled()) {
4108     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4109   } else {
4110     // Pass queue pointer to trap handler as input, and insert trap instruction
4111     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4112     const ArgDescriptor *Arg =
4113         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4114     if (!Arg)
4115       return false;
4116     MachineRegisterInfo &MRI = *B.getMRI();
4117     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4118     Register LiveIn = getLiveInRegister(
4119         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4120         /*InsertLiveInCopy=*/false);
4121     if (!loadInputValue(LiveIn, B, Arg))
4122       return false;
4123     B.buildCopy(SGPR01, LiveIn);
4124     B.buildInstr(AMDGPU::S_TRAP)
4125         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4126         .addReg(SGPR01, RegState::Implicit);
4127   }
4128 
4129   MI.eraseFromParent();
4130   return true;
4131 }
4132 
4133 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4134     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4135   // If this is a non-HSA path or the trap handler is disabled, report a
4136   // warning accordingly.
4137   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4138       !ST.isTrapHandlerEnabled()) {
4139     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4140                                      "debugtrap handler not supported",
4141                                      MI.getDebugLoc(), DS_Warning);
4142     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4143     Ctx.diagnose(NoTrap);
4144   } else {
4145     // Insert debug-trap instruction
4146     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4147   }
4148 
4149   MI.eraseFromParent();
4150   return true;
4151 }
4152 
4153 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4154                                             MachineIRBuilder &B,
4155                                             GISelChangeObserver &Observer) const {
4156   MachineRegisterInfo &MRI = *B.getMRI();
4157 
4158   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4159   auto IntrID = MI.getIntrinsicID();
4160   switch (IntrID) {
4161   case Intrinsic::amdgcn_if:
4162   case Intrinsic::amdgcn_else: {
4163     MachineInstr *Br = nullptr;
4164     MachineBasicBlock *UncondBrTarget = nullptr;
4165     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4166       const SIRegisterInfo *TRI
4167         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4168 
4169       Register Def = MI.getOperand(1).getReg();
4170       Register Use = MI.getOperand(3).getReg();
4171 
4172       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4173       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4174       if (IntrID == Intrinsic::amdgcn_if) {
4175         B.buildInstr(AMDGPU::SI_IF)
4176           .addDef(Def)
4177           .addUse(Use)
4178           .addMBB(UncondBrTarget);
4179       } else {
4180         B.buildInstr(AMDGPU::SI_ELSE)
4181           .addDef(Def)
4182           .addUse(Use)
4183           .addMBB(UncondBrTarget)
4184           .addImm(0);
4185       }
4186 
4187       if (Br) {
4188         Br->getOperand(0).setMBB(CondBrTarget);
4189       } else {
4190         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4191         // since we're swapping branch targets it needs to be reinserted.
4192         // FIXME: IRTranslator should probably not do this
4193         B.buildBr(*CondBrTarget);
4194       }
4195 
4196       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4197       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4198       MI.eraseFromParent();
4199       BrCond->eraseFromParent();
4200       return true;
4201     }
4202 
4203     return false;
4204   }
4205   case Intrinsic::amdgcn_loop: {
4206     MachineInstr *Br = nullptr;
4207     MachineBasicBlock *UncondBrTarget = nullptr;
4208     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4209       const SIRegisterInfo *TRI
4210         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4211 
4212       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4213       Register Reg = MI.getOperand(2).getReg();
4214 
4215       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4216       B.buildInstr(AMDGPU::SI_LOOP)
4217         .addUse(Reg)
4218         .addMBB(UncondBrTarget);
4219 
4220       if (Br)
4221         Br->getOperand(0).setMBB(CondBrTarget);
4222       else
4223         B.buildBr(*CondBrTarget);
4224 
4225       MI.eraseFromParent();
4226       BrCond->eraseFromParent();
4227       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4228       return true;
4229     }
4230 
4231     return false;
4232   }
4233   case Intrinsic::amdgcn_kernarg_segment_ptr:
4234     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4235       // This only makes sense to call in a kernel, so just lower to null.
4236       B.buildConstant(MI.getOperand(0).getReg(), 0);
4237       MI.eraseFromParent();
4238       return true;
4239     }
4240 
4241     return legalizePreloadedArgIntrin(
4242       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4243   case Intrinsic::amdgcn_implicitarg_ptr:
4244     return legalizeImplicitArgPtr(MI, MRI, B);
4245   case Intrinsic::amdgcn_workitem_id_x:
4246     return legalizePreloadedArgIntrin(MI, MRI, B,
4247                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4248   case Intrinsic::amdgcn_workitem_id_y:
4249     return legalizePreloadedArgIntrin(MI, MRI, B,
4250                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4251   case Intrinsic::amdgcn_workitem_id_z:
4252     return legalizePreloadedArgIntrin(MI, MRI, B,
4253                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4254   case Intrinsic::amdgcn_workgroup_id_x:
4255     return legalizePreloadedArgIntrin(MI, MRI, B,
4256                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4257   case Intrinsic::amdgcn_workgroup_id_y:
4258     return legalizePreloadedArgIntrin(MI, MRI, B,
4259                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4260   case Intrinsic::amdgcn_workgroup_id_z:
4261     return legalizePreloadedArgIntrin(MI, MRI, B,
4262                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4263   case Intrinsic::amdgcn_dispatch_ptr:
4264     return legalizePreloadedArgIntrin(MI, MRI, B,
4265                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4266   case Intrinsic::amdgcn_queue_ptr:
4267     return legalizePreloadedArgIntrin(MI, MRI, B,
4268                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4269   case Intrinsic::amdgcn_implicit_buffer_ptr:
4270     return legalizePreloadedArgIntrin(
4271       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4272   case Intrinsic::amdgcn_dispatch_id:
4273     return legalizePreloadedArgIntrin(MI, MRI, B,
4274                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4275   case Intrinsic::amdgcn_fdiv_fast:
4276     return legalizeFDIVFastIntrin(MI, MRI, B);
4277   case Intrinsic::amdgcn_is_shared:
4278     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4279   case Intrinsic::amdgcn_is_private:
4280     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4281   case Intrinsic::amdgcn_wavefrontsize: {
4282     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4283     MI.eraseFromParent();
4284     return true;
4285   }
4286   case Intrinsic::amdgcn_s_buffer_load:
4287     return legalizeSBufferLoad(MI, B, Observer);
4288   case Intrinsic::amdgcn_raw_buffer_store:
4289   case Intrinsic::amdgcn_struct_buffer_store:
4290     return legalizeBufferStore(MI, MRI, B, false, false);
4291   case Intrinsic::amdgcn_raw_buffer_store_format:
4292   case Intrinsic::amdgcn_struct_buffer_store_format:
4293     return legalizeBufferStore(MI, MRI, B, false, true);
4294   case Intrinsic::amdgcn_raw_tbuffer_store:
4295   case Intrinsic::amdgcn_struct_tbuffer_store:
4296     return legalizeBufferStore(MI, MRI, B, true, true);
4297   case Intrinsic::amdgcn_raw_buffer_load:
4298   case Intrinsic::amdgcn_struct_buffer_load:
4299     return legalizeBufferLoad(MI, MRI, B, false, false);
4300   case Intrinsic::amdgcn_raw_buffer_load_format:
4301   case Intrinsic::amdgcn_struct_buffer_load_format:
4302     return legalizeBufferLoad(MI, MRI, B, true, false);
4303   case Intrinsic::amdgcn_raw_tbuffer_load:
4304   case Intrinsic::amdgcn_struct_tbuffer_load:
4305     return legalizeBufferLoad(MI, MRI, B, true, true);
4306   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4307   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4308   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4309   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4310   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4311   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4312   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4313   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4314   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4315   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4316   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4317   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4318   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4319   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4320   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4321   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4322   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4323   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4324   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4325   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4326   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4327   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4328   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4329   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4330   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4331   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4332     return legalizeBufferAtomic(MI, B, IntrID);
4333   case Intrinsic::amdgcn_atomic_inc:
4334     return legalizeAtomicIncDec(MI, B, true);
4335   case Intrinsic::amdgcn_atomic_dec:
4336     return legalizeAtomicIncDec(MI, B, false);
4337   case Intrinsic::trap:
4338     return legalizeTrapIntrinsic(MI, MRI, B);
4339   case Intrinsic::debugtrap:
4340     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4341   default: {
4342     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4343             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4344       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4345     return true;
4346   }
4347   }
4348 
4349   return true;
4350 }
4351