1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of vector elements up to the next power of two.
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the scalar size in bits up to the next power of two.
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
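// Match types whose total size is at most MaxSize bits and whose scalar or
// element size is a multiple of 32 bits.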
53 static LegalityPredicate isMultiple32(unsigned TypeIdx,
54                                       unsigned MaxSize = 1024) {
55   return [=](const LegalityQuery &Query) {
56     const LLT Ty = Query.Types[TypeIdx];
57     const LLT EltTy = Ty.getScalarType();
58     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
59   };
60 }
61 
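// Match vectors with an odd number of elements narrower than 32 bits whose
// total size is not a multiple of 32 bits.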
62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
63   return [=](const LegalityQuery &Query) {
64     const LLT Ty = Query.Types[TypeIdx];
65     return Ty.isVector() &&
66            Ty.getNumElements() % 2 != 0 &&
67            Ty.getElementType().getSizeInBits() < 32 &&
68            Ty.getSizeInBits() % 32 != 0;
69   };
70 }
71 
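// Match vectors of 16-bit elements with more than two elements.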
72 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     const LLT EltTy = Ty.getScalarType();
76     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
77   };
78 }
79 
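// Mutation that pads the vector type with one extra element.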
80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getElementType();
84     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
85   };
86 }
87 
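// Mutation that reduces the element count so the resulting type is at most
// 64 bits wide.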
88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     unsigned Size = Ty.getSizeInBits();
93     unsigned Pieces = (Size + 63) / 64;
94     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
95     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
96   };
97 }
98 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104 
105     const LLT EltTy = Ty.getElementType();
106     const int Size = Ty.getSizeInBits();
107     const int EltSize = EltTy.getSizeInBits();
108     const int NextMul32 = (Size + 31) / 32;
109 
110     assert(EltSize < 32);
111 
112     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
113     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
114   };
115 }
116 
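// Mutation that coerces the type to an equivalently sized register type: s16
// for sub-32-bit sizes, otherwise a scalar or vector of 32-bit elements.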
117 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
118   return [=](const LegalityQuery &Query) {
119     const LLT Ty = Query.Types[TypeIdx];
120     unsigned Size = Ty.getSizeInBits();
121 
122     LLT CoercedTy;
123     if (Size < 32) {
124       // <2 x s8> -> s16
125       assert(Size == 16);
126       CoercedTy = LLT::scalar(16);
127     } else
128       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
129 
130     return std::make_pair(TypeIdx, CoercedTy);
131   };
132 }
133 
134 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
135   return [=](const LegalityQuery &Query) {
136     const LLT QueryTy = Query.Types[TypeIdx];
137     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
138   };
139 }
140 
141 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
142   return [=](const LegalityQuery &Query) {
143     const LLT QueryTy = Query.Types[TypeIdx];
144     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
145   };
146 }
147 
148 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
149   return [=](const LegalityQuery &Query) {
150     const LLT QueryTy = Query.Types[TypeIdx];
151     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
152   };
153 }
154 
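// Register-sized types are multiples of 32 bits, up to the 1024-bit width of
// the largest register class.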
155 static bool isRegisterSize(unsigned Size) {
156   return Size % 32 == 0 && Size <= 1024;
157 }
158 
159 static bool isRegisterVectorElementType(LLT EltTy) {
160   const int EltSize = EltTy.getSizeInBits();
161   return EltSize == 16 || EltSize % 32 == 0;
162 }
163 
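// Vectors are register types if they have 32, 64, 128 or 256-bit elements, or
// an even number of 16-bit elements.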
164 static bool isRegisterVectorType(LLT Ty) {
165   const int EltSize = Ty.getElementType().getSizeInBits();
166   return EltSize == 32 || EltSize == 64 ||
167          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
168          EltSize == 128 || EltSize == 256;
169 }
170 
171 static bool isRegisterType(LLT Ty) {
172   if (!isRegisterSize(Ty.getSizeInBits()))
173     return false;
174 
175   if (Ty.isVector())
176     return isRegisterVectorType(Ty);
177 
178   return true;
179 }
180 
181 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
182 // v2s16.
183 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     return isRegisterType(Query.Types[TypeIdx]);
186   };
187 }
188 
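// Match vectors whose elements can be operated on directly: s16 or anything
// at least 32 bits wide.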
189 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
190   return [=](const LegalityQuery &Query) {
191     const LLT QueryTy = Query.Types[TypeIdx];
192     if (!QueryTy.isVector())
193       return false;
194     const LLT EltTy = QueryTy.getElementType();
195     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
196   };
197 }
198 
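// Match truncating stores of a scalar wider than 32 bits, i.e. the memory
// size is smaller than the register size.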
199 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
200   return [=](const LegalityQuery &Query) {
201     const LLT Ty = Query.Types[TypeIdx];
202     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
203            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
204   };
205 }
206 
207 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
208 // handle some operations by just promoting the register during
209 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
210 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
211                                     bool IsLoad) {
212   switch (AS) {
213   case AMDGPUAS::PRIVATE_ADDRESS:
214     // FIXME: Private element size.
215     return 32;
216   case AMDGPUAS::LOCAL_ADDRESS:
217     return ST.useDS128() ? 128 : 64;
218   case AMDGPUAS::GLOBAL_ADDRESS:
219   case AMDGPUAS::CONSTANT_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
227     return IsLoad ? 512 : 128;
228   default:
229     // Flat addresses may contextually need to be split to 32-bit parts if they
230     // may alias scratch depending on the subtarget.
231     return 128;
232   }
233 }
234 
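// Return true if the memory size, alignment and address space of a load or
// store are directly supported without splitting or widening.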
235 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
236                                  const LegalityQuery &Query,
237                                  unsigned Opcode) {
238   const LLT Ty = Query.Types[0];
239 
240   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
241   const bool IsLoad = Opcode != AMDGPU::G_STORE;
242 
243   unsigned RegSize = Ty.getSizeInBits();
244   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
245   unsigned Align = Query.MMODescrs[0].AlignInBits;
246   unsigned AS = Query.Types[1].getAddressSpace();
247 
248   // All of these need to be custom lowered to cast the pointer operand.
249   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
250     return false;
251 
252   // TODO: We should be able to widen loads if the alignment is high enough, but
253   // we also need to modify the memory access size.
254 #if 0
255   // Accept widening loads based on alignment.
256   if (IsLoad && MemSize < Size)
257     MemSize = std::max(MemSize, Align);
258 #endif
259 
260   // Only 1-byte and 2-byte to 32-bit extloads are valid.
261   if (MemSize != RegSize && RegSize != 32)
262     return false;
263 
264   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
265     return false;
266 
267   switch (MemSize) {
268   case 8:
269   case 16:
270   case 32:
271   case 64:
272   case 128:
273     break;
274   case 96:
275     if (!ST.hasDwordx3LoadStores())
276       return false;
277     break;
278   case 256:
279   case 512:
280     // These may contextually need to be broken down.
281     break;
282   default:
283     return false;
284   }
285 
286   assert(RegSize >= MemSize);
287 
288   if (Align < MemSize) {
289     const SITargetLowering *TLI = ST.getTargetLowering();
290     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
291       return false;
292   }
293 
294   return true;
295 }
296 
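// A load or store is legal when the result is a register type and the memory
// access itself is supported.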
297 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
298                              unsigned Opcode) {
299   const LLT Ty = Query.Types[0];
300   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode);
301 }
302 
303 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
304                                          const GCNTargetMachine &TM)
  : ST(ST_) {
306   using namespace TargetOpcode;
307 
308   auto GetAddrSpacePtr = [&TM](unsigned AS) {
309     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
310   };
311 
312   const LLT S1 = LLT::scalar(1);
313   const LLT S16 = LLT::scalar(16);
314   const LLT S32 = LLT::scalar(32);
315   const LLT S64 = LLT::scalar(64);
316   const LLT S128 = LLT::scalar(128);
317   const LLT S256 = LLT::scalar(256);
318   const LLT S512 = LLT::scalar(512);
319   const LLT S1024 = LLT::scalar(1024);
320 
321   const LLT V2S16 = LLT::vector(2, 16);
322   const LLT V4S16 = LLT::vector(4, 16);
323 
324   const LLT V2S32 = LLT::vector(2, 32);
325   const LLT V3S32 = LLT::vector(3, 32);
326   const LLT V4S32 = LLT::vector(4, 32);
327   const LLT V5S32 = LLT::vector(5, 32);
328   const LLT V6S32 = LLT::vector(6, 32);
329   const LLT V7S32 = LLT::vector(7, 32);
330   const LLT V8S32 = LLT::vector(8, 32);
331   const LLT V9S32 = LLT::vector(9, 32);
332   const LLT V10S32 = LLT::vector(10, 32);
333   const LLT V11S32 = LLT::vector(11, 32);
334   const LLT V12S32 = LLT::vector(12, 32);
335   const LLT V13S32 = LLT::vector(13, 32);
336   const LLT V14S32 = LLT::vector(14, 32);
337   const LLT V15S32 = LLT::vector(15, 32);
338   const LLT V16S32 = LLT::vector(16, 32);
339   const LLT V32S32 = LLT::vector(32, 32);
340 
341   const LLT V2S64 = LLT::vector(2, 64);
342   const LLT V3S64 = LLT::vector(3, 64);
343   const LLT V4S64 = LLT::vector(4, 64);
344   const LLT V5S64 = LLT::vector(5, 64);
345   const LLT V6S64 = LLT::vector(6, 64);
346   const LLT V7S64 = LLT::vector(7, 64);
347   const LLT V8S64 = LLT::vector(8, 64);
348   const LLT V16S64 = LLT::vector(16, 64);
349 
350   std::initializer_list<LLT> AllS32Vectors =
351     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
352      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
353   std::initializer_list<LLT> AllS64Vectors =
354     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
355 
356   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
357   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
358   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
359   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
360   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
361   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
362   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
363 
364   const LLT CodePtr = FlatPtr;
365 
366   const std::initializer_list<LLT> AddrSpaces64 = {
367     GlobalPtr, ConstantPtr, FlatPtr
368   };
369 
370   const std::initializer_list<LLT> AddrSpaces32 = {
371     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
372   };
373 
374   const std::initializer_list<LLT> FPTypesBase = {
375     S32, S64
376   };
377 
378   const std::initializer_list<LLT> FPTypes16 = {
379     S32, S64, S16
380   };
381 
382   const std::initializer_list<LLT> FPTypesPK16 = {
383     S32, S64, S16, V2S16
384   };
385 
386   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
387 
388   setAction({G_BRCOND, S1}, Legal); // VCC branches
389   setAction({G_BRCOND, S32}, Legal); // SCC branches
390 
391   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
392   // elements for v3s16
393   getActionDefinitionsBuilder(G_PHI)
394     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
395     .legalFor(AllS32Vectors)
396     .legalFor(AllS64Vectors)
397     .legalFor(AddrSpaces64)
398     .legalFor(AddrSpaces32)
399     .clampScalar(0, S32, S256)
400     .widenScalarToNextPow2(0, 32)
401     .clampMaxNumElements(0, S32, 16)
402     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
403     .legalIf(isPointer(0));
404 
405   if (ST.hasVOP3PInsts()) {
406     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
407       .legalFor({S32, S16, V2S16})
408       .clampScalar(0, S16, S32)
409       .clampMaxNumElements(0, S16, 2)
410       .scalarize(0)
411       .widenScalarToNextPow2(0, 32);
412   } else if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
414       .legalFor({S32, S16})
415       .clampScalar(0, S16, S32)
416       .scalarize(0)
417       .widenScalarToNextPow2(0, 32);
418   } else {
419     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
420       .legalFor({S32})
421       .clampScalar(0, S32, S32)
422       .scalarize(0);
423   }
424 
425   // FIXME: Not really legal. Placeholder for custom lowering.
426   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
427     .customFor({S32, S64})
428     .clampScalar(0, S32, S64)
429     .widenScalarToNextPow2(0, 32)
430     .scalarize(0);
431 
432   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
433     .legalFor({S32})
434     .clampScalar(0, S32, S32)
435     .scalarize(0);
436 
437   // Report legal for any types we can handle anywhere. For the cases only legal
438   // on the SALU, RegBankSelect will be able to re-legalize.
439   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
440     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
441     .clampScalar(0, S32, S64)
442     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
443     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
444     .widenScalarToNextPow2(0)
445     .scalarize(0);
446 
447   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
448                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
449     .legalFor({{S32, S1}, {S32, S32}})
450     .minScalar(0, S32)
451     // TODO: .scalarize(0)
452     .lower();
453 
454   getActionDefinitionsBuilder(G_BITCAST)
455     // Don't worry about the size constraint.
456     .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

460   getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
463     .clampScalar(0, S32, S64)
464     .widenScalarToNextPow2(0)
465     .legalIf(isPointer(0));
466 
467   getActionDefinitionsBuilder(G_FCONSTANT)
468     .legalFor({S32, S64, S16})
469     .clampScalar(0, S16, S64);
470 
471   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
472       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
473                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
474       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
475       .clampScalarOrElt(0, S32, S1024)
476       .legalIf(isMultiple32(0))
477       .widenScalarToNextPow2(0, 32)
478       .clampMaxNumElements(0, S32, 16);
479 
480   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
481 
482   // If the amount is divergent, we have to do a wave reduction to get the
483   // maximum value, so this is expanded during RegBankSelect.
484   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
485     .legalFor({{PrivatePtr, S32}});
486 
487   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
488     .unsupportedFor({PrivatePtr})
489     .custom();
490   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491 
492   auto &FPOpActions = getActionDefinitionsBuilder(
493     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
494     .legalFor({S32, S64});
495   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
496     .customFor({S32, S64});
497   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
498     .customFor({S32, S64});
499 
500   if (ST.has16BitInsts()) {
501     if (ST.hasVOP3PInsts())
502       FPOpActions.legalFor({S16, V2S16});
503     else
504       FPOpActions.legalFor({S16});
505 
506     TrigActions.customFor({S16});
507     FDIVActions.customFor({S16});
508   }
509 
510   auto &MinNumMaxNum = getActionDefinitionsBuilder({
511       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
512 
513   if (ST.hasVOP3PInsts()) {
514     MinNumMaxNum.customFor(FPTypesPK16)
515       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516       .clampMaxNumElements(0, S16, 2)
517       .clampScalar(0, S16, S64)
518       .scalarize(0);
519   } else if (ST.has16BitInsts()) {
520     MinNumMaxNum.customFor(FPTypes16)
521       .clampScalar(0, S16, S64)
522       .scalarize(0);
523   } else {
524     MinNumMaxNum.customFor(FPTypesBase)
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   }
528 
529   if (ST.hasVOP3PInsts())
530     FPOpActions.clampMaxNumElements(0, S16, 2);
531 
532   FPOpActions
533     .scalarize(0)
534     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
535 
536   TrigActions
537     .scalarize(0)
538     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
539 
540   FDIVActions
541     .scalarize(0)
542     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
543 
544   getActionDefinitionsBuilder({G_FNEG, G_FABS})
545     .legalFor(FPTypesPK16)
546     .clampMaxNumElements(0, S16, 2)
547     .scalarize(0)
548     .clampScalar(0, S16, S64);
549 
550   if (ST.has16BitInsts()) {
551     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
552       .legalFor({S32, S64, S16})
553       .scalarize(0)
554       .clampScalar(0, S16, S64);
555   } else {
556     getActionDefinitionsBuilder(G_FSQRT)
557       .legalFor({S32, S64})
558       .scalarize(0)
559       .clampScalar(0, S32, S64);
560 
561     if (ST.hasFractBug()) {
562       getActionDefinitionsBuilder(G_FFLOOR)
563         .customFor({S64})
564         .legalFor({S32, S64})
565         .scalarize(0)
566         .clampScalar(0, S32, S64);
567     } else {
568       getActionDefinitionsBuilder(G_FFLOOR)
569         .legalFor({S32, S64})
570         .scalarize(0)
571         .clampScalar(0, S32, S64);
572     }
573   }
574 
575   getActionDefinitionsBuilder(G_FPTRUNC)
576     .legalFor({{S32, S64}, {S16, S32}})
577     .scalarize(0)
578     .lower();
579 
580   getActionDefinitionsBuilder(G_FPEXT)
581     .legalFor({{S64, S32}, {S32, S16}})
582     .lowerFor({{S64, S16}}) // FIXME: Implement
583     .scalarize(0);
584 
585   getActionDefinitionsBuilder(G_FSUB)
586       // Use actual fsub instruction
587       .legalFor({S32})
588       // Must use fadd + fneg
589       .lowerFor({S64, S16, V2S16})
590       .scalarize(0)
591       .clampScalar(0, S32, S64);
592 
593   // Whether this is legal depends on the floating point mode for the function.
594   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
595   if (ST.hasMadF16())
596     FMad.customFor({S32, S16});
597   else
598     FMad.customFor({S32});
599   FMad.scalarize(0)
600       .lower();
601 
602   // TODO: Do we need to clamp maximum bitwidth?
603   getActionDefinitionsBuilder(G_TRUNC)
604     .legalIf(isScalar(0))
605     .legalFor({{V2S16, V2S32}})
606     .clampMaxNumElements(0, S16, 2)
607     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
608     // situations (like an invalid implicit use), we don't want to infinite loop
609     // in the legalizer.
610     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
611     .alwaysLegal();
612 
613   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
614     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
615                {S32, S1}, {S64, S1}, {S16, S1}})
616     .scalarize(0)
617     .clampScalar(0, S32, S64)
618     .widenScalarToNextPow2(1, 32);
619 
620   // TODO: Split s1->s64 during regbankselect for VALU.
621   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
622     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
623     .lowerFor({{S32, S64}})
624     .lowerIf(typeIs(1, S1))
625     .customFor({{S64, S64}});
626   if (ST.has16BitInsts())
627     IToFP.legalFor({{S16, S16}});
628   IToFP.clampScalar(1, S32, S64)
629        .scalarize(0)
630        .widenScalarToNextPow2(1);
631 
632   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
633     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
634     .customFor({{S64, S64}});
635   if (ST.has16BitInsts())
636     FPToI.legalFor({{S16, S16}});
637   else
638     FPToI.minScalar(1, S32);
639 
640   FPToI.minScalar(0, S32)
641        .scalarize(0)
642        .lower();
643 
644   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
645     .scalarize(0)
646     .lower();
647 
648   if (ST.has16BitInsts()) {
649     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
650       .legalFor({S16, S32, S64})
651       .clampScalar(0, S16, S64)
652       .scalarize(0);
653   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
654     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
655       .legalFor({S32, S64})
656       .clampScalar(0, S32, S64)
657       .scalarize(0);
658   } else {
659     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
660       .legalFor({S32})
661       .customFor({S64})
662       .clampScalar(0, S32, S64)
663       .scalarize(0);
664   }
665 
666   // FIXME: Clamp offset operand.
667   getActionDefinitionsBuilder(G_PTR_ADD)
668     .legalIf(isPointer(0))
669     .scalarize(0);
670 
671   getActionDefinitionsBuilder(G_PTRMASK)
672     .legalIf(typeInSet(1, {S64, S32}))
673     .minScalar(1, S32)
674     .maxScalarIf(sizeIs(0, 32), 1, S32)
675     .maxScalarIf(sizeIs(0, 64), 1, S64)
676     .scalarize(0);
677 
678   auto &CmpBuilder =
679     getActionDefinitionsBuilder(G_ICMP)
680     // The compare output type differs based on the register bank of the output,
681     // so make both s1 and s32 legal.
682     //
683     // Scalar compares producing output in scc will be promoted to s32, as that
684     // is the allocatable register type that will be needed for the copy from
685     // scc. This will be promoted during RegBankSelect, and we assume something
686     // before that won't try to use s32 result types.
687     //
688     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
689     // bank.
690     .legalForCartesianProduct(
691       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
692     .legalForCartesianProduct(
693       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
694   if (ST.has16BitInsts()) {
695     CmpBuilder.legalFor({{S1, S16}});
696   }
697 
698   CmpBuilder
699     .widenScalarToNextPow2(1)
700     .clampScalar(1, S32, S64)
701     .scalarize(0)
702     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
703 
704   getActionDefinitionsBuilder(G_FCMP)
705     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
706     .widenScalarToNextPow2(1)
707     .clampScalar(1, S32, S64)
708     .scalarize(0);
709 
710   // FIXME: fpow has a selection pattern that should move to custom lowering.
711   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
712   if (ST.has16BitInsts())
713     Exp2Ops.legalFor({S32, S16});
714   else
715     Exp2Ops.legalFor({S32});
716   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
717   Exp2Ops.scalarize(0);
718 
719   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
720   if (ST.has16BitInsts())
721     ExpOps.customFor({{S32}, {S16}});
722   else
723     ExpOps.customFor({S32});
724   ExpOps.clampScalar(0, MinScalarFPTy, S32)
725         .scalarize(0);
726 
727   // The 64-bit versions produce 32-bit results, but only on the SALU.
728   getActionDefinitionsBuilder(G_CTPOP)
729     .legalFor({{S32, S32}, {S32, S64}})
730     .clampScalar(0, S32, S32)
731     .clampScalar(1, S32, S64)
732     .scalarize(0)
733     .widenScalarToNextPow2(0, 32)
734     .widenScalarToNextPow2(1, 32);
735 
736   // The hardware instructions return a different result on 0 than the generic
737   // instructions expect. The hardware produces -1, but these produce the
738   // bitwidth.
739   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
740     .scalarize(0)
741     .clampScalar(0, S32, S32)
742     .clampScalar(1, S32, S64)
743     .widenScalarToNextPow2(0, 32)
744     .widenScalarToNextPow2(1, 32)
745     .lower();
746 
747   // The 64-bit versions produce 32-bit results, but only on the SALU.
748   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
749     .legalFor({{S32, S32}, {S32, S64}})
750     .clampScalar(0, S32, S32)
751     .clampScalar(1, S32, S64)
752     .scalarize(0)
753     .widenScalarToNextPow2(0, 32)
754     .widenScalarToNextPow2(1, 32);
755 
756   getActionDefinitionsBuilder(G_BITREVERSE)
757     .legalFor({S32})
758     .clampScalar(0, S32, S32)
759     .scalarize(0);
760 
761   if (ST.has16BitInsts()) {
762     getActionDefinitionsBuilder(G_BSWAP)
763       .legalFor({S16, S32, V2S16})
764       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
767       .widenScalarToNextPow2(0)
768       .clampScalar(0, S16, S32)
769       .scalarize(0);
770 
771     if (ST.hasVOP3PInsts()) {
772       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
773         .legalFor({S32, S16, V2S16})
774         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
775         .clampMaxNumElements(0, S16, 2)
776         .minScalar(0, S16)
777         .widenScalarToNextPow2(0)
778         .scalarize(0)
779         .lower();
780     } else {
781       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
782         .legalFor({S32, S16})
783         .widenScalarToNextPow2(0)
784         .minScalar(0, S16)
785         .scalarize(0)
786         .lower();
787     }
788   } else {
789     // TODO: Should have same legality without v_perm_b32
790     getActionDefinitionsBuilder(G_BSWAP)
791       .legalFor({S32})
792       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
795       .widenScalarToNextPow2(0)
796       .maxScalar(0, S32)
797       .scalarize(0)
798       .lower();
799 
800     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
801       .legalFor({S32})
802       .minScalar(0, S32)
803       .widenScalarToNextPow2(0)
804       .scalarize(0)
805       .lower();
806   }
807 
808   getActionDefinitionsBuilder(G_INTTOPTR)
809     // List the common cases
810     .legalForCartesianProduct(AddrSpaces64, {S64})
811     .legalForCartesianProduct(AddrSpaces32, {S32})
812     .scalarize(0)
813     // Accept any address space as long as the size matches
814     .legalIf(sameSize(0, 1))
815     .widenScalarIf(smallerThan(1, 0),
816       [](const LegalityQuery &Query) {
817         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
818       })
819     .narrowScalarIf(largerThan(1, 0),
820       [](const LegalityQuery &Query) {
821         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
822       });
823 
824   getActionDefinitionsBuilder(G_PTRTOINT)
825     // List the common cases
826     .legalForCartesianProduct(AddrSpaces64, {S64})
827     .legalForCartesianProduct(AddrSpaces32, {S32})
828     .scalarize(0)
829     // Accept any address space as long as the size matches
830     .legalIf(sameSize(0, 1))
831     .widenScalarIf(smallerThan(0, 1),
832       [](const LegalityQuery &Query) {
833         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
834       })
835     .narrowScalarIf(
836       largerThan(0, 1),
837       [](const LegalityQuery &Query) {
838         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
839       });
840 
841   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
842     .scalarize(0)
843     .custom();
844 
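  // Return true if a memory access must be split into multiple accesses: it
  // exceeds the maximum access size for the address space, needs an
  // unsupported number of dwords, or is insufficiently aligned.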
845   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
846                                     bool IsLoad) -> bool {
847     const LLT DstTy = Query.Types[0];
848 
849     // Split vector extloads.
850     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
851     unsigned Align = Query.MMODescrs[0].AlignInBits;
852 
853     if (MemSize < DstTy.getSizeInBits())
854       MemSize = std::max(MemSize, Align);
855 
856     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
857       return true;
858 
859     const LLT PtrTy = Query.Types[1];
860     unsigned AS = PtrTy.getAddressSpace();
861     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
862       return true;
863 
864     // Catch weird sized loads that don't evenly divide into the access sizes
865     // TODO: May be able to widen depending on alignment etc.
866     unsigned NumRegs = (MemSize + 31) / 32;
867     if (NumRegs == 3) {
868       if (!ST.hasDwordx3LoadStores())
869         return true;
870     } else {
871       // If the alignment allows, these should have been widened.
872       if (!isPowerOf2_32(NumRegs))
873         return true;
874     }
875 
876     if (Align < MemSize) {
877       const SITargetLowering *TLI = ST.getTargetLowering();
878       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
879     }
880 
881     return false;
882   };
883 
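  // Return true if a load with a non-power-of-2 result size should be widened
  // to the next power of 2: only when the alignment covers the rounded-up
  // size and the original size is below the address space's access limit.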
884   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
885                                          unsigned Opc) -> bool {
886     unsigned Size = Query.Types[0].getSizeInBits();
887     if (isPowerOf2_32(Size))
888       return false;
889 
890     if (Size == 96 && ST.hasDwordx3LoadStores())
891       return false;
892 
893     unsigned AddrSpace = Query.Types[1].getAddressSpace();
894     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
895       return false;
896 
897     unsigned Align = Query.MMODescrs[0].AlignInBits;
898     unsigned RoundedSize = NextPowerOf2(Size);
899     return (Align >= RoundedSize);
900   };
901 
902   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
903   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
904   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
905 
906   // TODO: Refine based on subtargets which support unaligned access or 128-bit
907   // LDS
908   // TODO: Unsupported flat for SI.
909 
910   for (unsigned Op : {G_LOAD, G_STORE}) {
911     const bool IsStore = Op == G_STORE;
912 
913     auto &Actions = getActionDefinitionsBuilder(Op);
914     // Whitelist some common cases.
915     // TODO: Does this help compile time at all?
916     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
917                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
918                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
919                                       {S64, GlobalPtr, 64, GlobalAlign32},
920                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
921                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
922                                       {S32, GlobalPtr, 8, GlobalAlign8},
923                                       {S32, GlobalPtr, 16, GlobalAlign16},
924 
925                                       {S32, LocalPtr, 32, 32},
926                                       {S64, LocalPtr, 64, 32},
927                                       {V2S32, LocalPtr, 64, 32},
928                                       {S32, LocalPtr, 8, 8},
929                                       {S32, LocalPtr, 16, 16},
930                                       {V2S16, LocalPtr, 32, 32},
931 
932                                       {S32, PrivatePtr, 32, 32},
933                                       {S32, PrivatePtr, 8, 8},
934                                       {S32, PrivatePtr, 16, 16},
935                                       {V2S16, PrivatePtr, 32, 32},
936 
937                                       {S32, ConstantPtr, 32, GlobalAlign32},
938                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
939                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
940                                       {S64, ConstantPtr, 64, GlobalAlign32},
941                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
942     Actions.legalIf(
943       [=](const LegalityQuery &Query) -> bool {
944         return isLoadStoreLegal(ST, Query, Op);
945       });
946 
947     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
948     // 64-bits.
949     //
950     // TODO: Should generalize bitcast action into coerce, which will also cover
951     // inserting addrspacecasts.
952     Actions.customIf(typeIs(1, Constant32Ptr));
953 
954     // Turn any illegal element vectors into something easier to deal
955     // with. These will ultimately produce 32-bit scalar shifts to extract the
956     // parts anyway.
957     //
958     // For odd 16-bit element vectors, prefer to split those into pieces with
959     // 16-bit vector parts.
960     Actions.bitcastIf(
961       [=](const LegalityQuery &Query) -> bool {
962         LLT Ty = Query.Types[0];
963         return Ty.isVector() &&
964                isRegisterSize(Ty.getSizeInBits()) &&
965                !isRegisterVectorElementType(Ty.getElementType());
966       }, bitcastToRegisterType(0));
967 
968     Actions
969         .customIf(typeIs(1, Constant32Ptr))
970         // Widen suitably aligned loads by loading extra elements.
971         .moreElementsIf([=](const LegalityQuery &Query) {
972             const LLT Ty = Query.Types[0];
973             return Op == G_LOAD && Ty.isVector() &&
974                    shouldWidenLoadResult(Query, Op);
975           }, moreElementsToNextPow2(0))
976         .widenScalarIf([=](const LegalityQuery &Query) {
977             const LLT Ty = Query.Types[0];
978             return Op == G_LOAD && !Ty.isVector() &&
979                    shouldWidenLoadResult(Query, Op);
980           }, widenScalarOrEltToNextPow2(0))
981         .narrowScalarIf(
982             [=](const LegalityQuery &Query) -> bool {
983               return !Query.Types[0].isVector() &&
984                      needToSplitMemOp(Query, Op == G_LOAD);
985             },
986             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
987               const LLT DstTy = Query.Types[0];
988               const LLT PtrTy = Query.Types[1];
989 
990               const unsigned DstSize = DstTy.getSizeInBits();
991               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
992 
993               // Split extloads.
994               if (DstSize > MemSize)
995                 return std::make_pair(0, LLT::scalar(MemSize));
996 
997               if (!isPowerOf2_32(DstSize)) {
998                 // We're probably decomposing an odd sized store. Try to split
999                 // to the widest type. TODO: Account for alignment. As-is it
1000                 // should be OK, since the new parts will be further legalized.
1001                 unsigned FloorSize = PowerOf2Floor(DstSize);
1002                 return std::make_pair(0, LLT::scalar(FloorSize));
1003               }
1004 
1005               if (DstSize > 32 && (DstSize % 32 != 0)) {
1006                 // FIXME: Need a way to specify non-extload of larger size if
1007                 // suitably aligned.
1008                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1009               }
1010 
1011               unsigned MaxSize = maxSizeForAddrSpace(ST,
1012                                                      PtrTy.getAddressSpace(),
1013                                                      Op == G_LOAD);
1014               if (MemSize > MaxSize)
1015                 return std::make_pair(0, LLT::scalar(MaxSize));
1016 
1017               unsigned Align = Query.MMODescrs[0].AlignInBits;
1018               return std::make_pair(0, LLT::scalar(Align));
1019             })
1020         .fewerElementsIf(
1021             [=](const LegalityQuery &Query) -> bool {
1022               return Query.Types[0].isVector() &&
1023                      needToSplitMemOp(Query, Op == G_LOAD);
1024             },
1025             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1026               const LLT DstTy = Query.Types[0];
1027               const LLT PtrTy = Query.Types[1];
1028 
1029               LLT EltTy = DstTy.getElementType();
1030               unsigned MaxSize = maxSizeForAddrSpace(ST,
1031                                                      PtrTy.getAddressSpace(),
1032                                                      Op == G_LOAD);
1033 
1034               // FIXME: Handle widened to power of 2 results better. This ends
1035               // up scalarizing.
1036               // FIXME: 3 element stores scalarized on SI
1037 
1038               // Split if it's too large for the address space.
1039               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1040                 unsigned NumElts = DstTy.getNumElements();
1041                 unsigned EltSize = EltTy.getSizeInBits();
1042 
1043                 if (MaxSize % EltSize == 0) {
1044                   return std::make_pair(
1045                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1046                 }
1047 
1048                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1049 
1050                 // FIXME: Refine when odd breakdowns handled
1051                 // The scalars will need to be re-legalized.
1052                 if (NumPieces == 1 || NumPieces >= NumElts ||
1053                     NumElts % NumPieces != 0)
1054                   return std::make_pair(0, EltTy);
1055 
1056                 return std::make_pair(0,
1057                                       LLT::vector(NumElts / NumPieces, EltTy));
1058               }
1059 
1060               // FIXME: We could probably handle weird extending loads better.
1061               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1062               if (DstTy.getSizeInBits() > MemSize)
1063                 return std::make_pair(0, EltTy);
1064 
1065               unsigned EltSize = EltTy.getSizeInBits();
1066               unsigned DstSize = DstTy.getSizeInBits();
1067               if (!isPowerOf2_32(DstSize)) {
1068                 // We're probably decomposing an odd sized store. Try to split
1069                 // to the widest type. TODO: Account for alignment. As-is it
1070                 // should be OK, since the new parts will be further legalized.
1071                 unsigned FloorSize = PowerOf2Floor(DstSize);
1072                 return std::make_pair(
1073                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1074               }
1075 
1076               // Need to split because of alignment.
1077               unsigned Align = Query.MMODescrs[0].AlignInBits;
1078               if (EltSize > Align &&
1079                   (EltSize / Align < DstTy.getNumElements())) {
1080                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1081               }
1082 
1083               // May need relegalization for the scalars.
1084               return std::make_pair(0, EltTy);
1085             })
1086         .minScalar(0, S32);
1087 
1088     if (IsStore)
1089       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1090 
1091     // TODO: Need a bitcast lower option?
1092     Actions
1093         .widenScalarToNextPow2(0)
1094         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1095   }
1096 
1097   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1098                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1099                                                   {S32, GlobalPtr, 16, 2 * 8},
1100                                                   {S32, LocalPtr, 8, 8},
1101                                                   {S32, LocalPtr, 16, 16},
1102                                                   {S32, PrivatePtr, 8, 8},
1103                                                   {S32, PrivatePtr, 16, 16},
1104                                                   {S32, ConstantPtr, 8, 8},
1105                                                   {S32, ConstantPtr, 16, 2 * 8}});
1106   if (ST.hasFlatAddressSpace()) {
1107     ExtLoads.legalForTypesWithMemDesc(
1108         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1109   }
1110 
1111   ExtLoads.clampScalar(0, S32, S32)
1112           .widenScalarToNextPow2(0)
1113           .unsupportedIfMemSizeNotPow2()
1114           .lower();
1115 
1116   auto &Atomics = getActionDefinitionsBuilder(
1117     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1118      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1119      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1120      G_ATOMICRMW_UMIN})
1121     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1122                {S64, GlobalPtr}, {S64, LocalPtr}});
1123   if (ST.hasFlatAddressSpace()) {
1124     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1125   }
1126 
1127   if (ST.hasLDSFPAtomics()) {
1128     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1129       .legalFor({{S32, LocalPtr}});
1130   }
1131 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1134   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1135     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1136                 {S32, FlatPtr}, {S64, FlatPtr}})
1137     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1138                {S32, RegionPtr}, {S64, RegionPtr}});
1139   // TODO: Pointer types, any 32-bit or 64-bit vector
1140 
1141   // Condition should be s32 for scalar, s1 for vector.
1142   getActionDefinitionsBuilder(G_SELECT)
1143     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1144           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1145           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1146     .clampScalar(0, S16, S64)
1147     .scalarize(1)
1148     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1149     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1150     .clampMaxNumElements(0, S32, 2)
1151     .clampMaxNumElements(0, LocalPtr, 2)
1152     .clampMaxNumElements(0, PrivatePtr, 2)
1153     .scalarize(0)
1154     .widenScalarToNextPow2(0)
1155     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1156 
1157   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1158   // be more flexible with the shift amount type.
1159   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1160     .legalFor({{S32, S32}, {S64, S32}});
1161   if (ST.has16BitInsts()) {
1162     if (ST.hasVOP3PInsts()) {
1163       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1164             .clampMaxNumElements(0, S16, 2);
1165     } else
1166       Shifts.legalFor({{S16, S16}});
1167 
1168     // TODO: Support 16-bit shift amounts for all types
1169     Shifts.widenScalarIf(
1170       [=](const LegalityQuery &Query) {
1171         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1172         // 32-bit amount.
1173         const LLT ValTy = Query.Types[0];
1174         const LLT AmountTy = Query.Types[1];
1175         return ValTy.getSizeInBits() <= 16 &&
1176                AmountTy.getSizeInBits() < 16;
1177       }, changeTo(1, S16));
1178     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1179     Shifts.clampScalar(1, S32, S32);
1180     Shifts.clampScalar(0, S16, S64);
1181     Shifts.widenScalarToNextPow2(0, 16);
1182   } else {
1183     // Make sure we legalize the shift amount type first, as the general
1184     // expansion for the shifted type will produce much worse code if it hasn't
1185     // been truncated already.
1186     Shifts.clampScalar(1, S32, S32);
1187     Shifts.clampScalar(0, S32, S64);
1188     Shifts.widenScalarToNextPow2(0, 32);
1189   }
1190   Shifts.scalarize(0);
1191 
1192   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1193     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1194     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1195     unsigned IdxTypeIdx = 2;
1196 
1197     getActionDefinitionsBuilder(Op)
1198       .customIf([=](const LegalityQuery &Query) {
1199           const LLT EltTy = Query.Types[EltTypeIdx];
1200           const LLT VecTy = Query.Types[VecTypeIdx];
1201           const LLT IdxTy = Query.Types[IdxTypeIdx];
1202           return (EltTy.getSizeInBits() == 16 ||
1203                   EltTy.getSizeInBits() % 32 == 0) &&
1204                  VecTy.getSizeInBits() % 32 == 0 &&
1205                  VecTy.getSizeInBits() <= 1024 &&
1206                  IdxTy.getSizeInBits() == 32;
1207         })
1208       .clampScalar(EltTypeIdx, S32, S64)
1209       .clampScalar(VecTypeIdx, S32, S64)
1210       .clampScalar(IdxTypeIdx, S32, S32);
1211   }
1212 
1213   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1214     .unsupportedIf([=](const LegalityQuery &Query) {
1215         const LLT &EltTy = Query.Types[1].getElementType();
1216         return Query.Types[0] != EltTy;
1217       });
1218 
1219   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1220     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1221     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1222 
1223     // FIXME: Doesn't handle extract of illegal sizes.
1224     getActionDefinitionsBuilder(Op)
1225       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1226       // FIXME: Multiples of 16 should not be legal.
1227       .legalIf([=](const LegalityQuery &Query) {
1228           const LLT BigTy = Query.Types[BigTyIdx];
1229           const LLT LitTy = Query.Types[LitTyIdx];
1230           return (BigTy.getSizeInBits() % 32 == 0) &&
1231                  (LitTy.getSizeInBits() % 16 == 0);
1232         })
1233       .widenScalarIf(
1234         [=](const LegalityQuery &Query) {
1235           const LLT BigTy = Query.Types[BigTyIdx];
1236           return (BigTy.getScalarSizeInBits() < 16);
1237         },
1238         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1239       .widenScalarIf(
1240         [=](const LegalityQuery &Query) {
1241           const LLT LitTy = Query.Types[LitTyIdx];
1242           return (LitTy.getScalarSizeInBits() < 16);
1243         },
1244         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1245       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1246       .widenScalarToNextPow2(BigTyIdx, 32);
1247 
1248   }
1249 
1250   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1251     .legalForCartesianProduct(AllS32Vectors, {S32})
1252     .legalForCartesianProduct(AllS64Vectors, {S64})
1253     .clampNumElements(0, V16S32, V32S32)
1254     .clampNumElements(0, V2S64, V16S64)
1255     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1256 
1257   if (ST.hasScalarPackInsts()) {
1258     BuildVector
1259       // FIXME: Should probably widen s1 vectors straight to s32
1260       .minScalarOrElt(0, S16)
1261       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1262       .minScalar(1, S32);
1263 
1264     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1265       .legalFor({V2S16, S32})
1266       .lower();
1267     BuildVector.minScalarOrElt(0, S32);
1268   } else {
1269     BuildVector.customFor({V2S16, S16});
1270     BuildVector.minScalarOrElt(0, S32);
1271 
1272     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1273       .customFor({V2S16, S32})
1274       .lower();
1275   }
1276 
1277   BuildVector.legalIf(isRegisterType(0));
1278 
1279   // FIXME: Clamp maximum size
1280   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1281     .legalIf(isRegisterType(0));
1282 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1285   if (ST.hasVOP3PInsts()) {
1286     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1287       .customFor({V2S16, V2S16})
1288       .lower();
1289   } else
1290     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1291 
1292   // Merge/Unmerge
1293   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1294     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1295     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1296 
1297     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1298       const LLT Ty = Query.Types[TypeIdx];
1299       if (Ty.isVector()) {
1300         const LLT &EltTy = Ty.getElementType();
1301         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1302           return true;
1303         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1304           return true;
1305       }
1306       return false;
1307     };
1308 
1309     auto &Builder = getActionDefinitionsBuilder(Op)
1310       .lowerFor({{S16, V2S16}})
1311       .lowerIf([=](const LegalityQuery &Query) {
1312           const LLT BigTy = Query.Types[BigTyIdx];
1313           return BigTy.getSizeInBits() == 32;
1314         })
1315       // Try to widen to s16 first for small types.
1316       // TODO: Only do this on targets with legal s16 shifts
1317       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1318       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1319       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1320       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1321                            elementTypeIs(1, S16)),
1322                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1326       .clampScalar(LitTyIdx, S32, S512)
1327       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1328       // Break up vectors with weird elements into scalars
1329       .fewerElementsIf(
1330         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1331         scalarize(0))
1332       .fewerElementsIf(
1333         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1334         scalarize(1))
1335       .clampScalar(BigTyIdx, S32, S1024);
1336 
1337     if (Op == G_MERGE_VALUES) {
1338       Builder.widenScalarIf(
1339         // TODO: Use 16-bit shifts if legal for 8-bit values?
1340         [=](const LegalityQuery &Query) {
1341           const LLT Ty = Query.Types[LitTyIdx];
1342           return Ty.getSizeInBits() < 32;
1343         },
1344         changeTo(LitTyIdx, S32));
1345     }
1346 
1347     Builder.widenScalarIf(
1348       [=](const LegalityQuery &Query) {
1349         const LLT Ty = Query.Types[BigTyIdx];
1350         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1351           Ty.getSizeInBits() % 16 != 0;
1352       },
1353       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1356         const LLT &Ty = Query.Types[BigTyIdx];
1357         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1358         if (NewSizeInBits >= 256) {
1359           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1360           if (RoundedTo < NewSizeInBits)
1361             NewSizeInBits = RoundedTo;
1362         }
1363         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1364       })
1365       .legalIf([=](const LegalityQuery &Query) {
1366           const LLT &BigTy = Query.Types[BigTyIdx];
1367           const LLT &LitTy = Query.Types[LitTyIdx];
1368 
1369           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1370             return false;
1371           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1372             return false;
1373 
1374           return BigTy.getSizeInBits() % 16 == 0 &&
1375                  LitTy.getSizeInBits() % 16 == 0 &&
1376                  BigTy.getSizeInBits() <= 1024;
1377         })
1378       // Any vectors left are the wrong size. Scalarize them.
1379       .scalarize(0)
1380       .scalarize(1);
1381   }
1382 
1383   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1384   // RegBankSelect.
1385   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1386     .legalFor({{S32}, {S64}});
1387 
1388   if (ST.hasVOP3PInsts()) {
1389     SextInReg.lowerFor({{V2S16}})
1390       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1391       // get more vector shift opportunities, since we'll get those when
1392       // expanded.
1393       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1394   } else if (ST.has16BitInsts()) {
1395     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1396   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1399     SextInReg.lowerFor({{S32}, {S64}});
1400   }
1401 
1402   SextInReg
1403     .scalarize(0)
1404     .clampScalar(0, S32, S64)
1405     .lower();
1406 
1407   getActionDefinitionsBuilder(G_FSHR)
1408     .legalFor({{S32, S32}})
1409     .scalarize(0)
1410     .lower();
1411 
1412   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1413     .legalFor({S64});
1414 
1415   getActionDefinitionsBuilder({
1416       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1417       G_FCOPYSIGN,
1418 
1419       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1420       G_READ_REGISTER,
1421       G_WRITE_REGISTER,
1422 
1423       G_SADDO, G_SSUBO,
1424 
      // TODO: Implement
1426       G_FMINIMUM, G_FMAXIMUM,
1427       G_FSHL
1428     }).lower();
1429 
1430   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1431         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1432         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1433     .unsupported();
1434 
1435   computeTables();
1436   verify(*ST.getInstrInfo());
1437 }
1438 
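// Dispatch instructions marked as custom in the rules above to their
// dedicated legalization routines.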
1439 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1440                                          MachineRegisterInfo &MRI,
1441                                          MachineIRBuilder &B,
1442                                          GISelChangeObserver &Observer) const {
1443   switch (MI.getOpcode()) {
1444   case TargetOpcode::G_ADDRSPACE_CAST:
1445     return legalizeAddrSpaceCast(MI, MRI, B);
1446   case TargetOpcode::G_FRINT:
1447     return legalizeFrint(MI, MRI, B);
1448   case TargetOpcode::G_FCEIL:
1449     return legalizeFceil(MI, MRI, B);
1450   case TargetOpcode::G_INTRINSIC_TRUNC:
1451     return legalizeIntrinsicTrunc(MI, MRI, B);
1452   case TargetOpcode::G_SITOFP:
1453     return legalizeITOFP(MI, MRI, B, true);
1454   case TargetOpcode::G_UITOFP:
1455     return legalizeITOFP(MI, MRI, B, false);
1456   case TargetOpcode::G_FPTOSI:
1457     return legalizeFPTOI(MI, MRI, B, true);
1458   case TargetOpcode::G_FPTOUI:
1459     return legalizeFPTOI(MI, MRI, B, false);
1460   case TargetOpcode::G_FMINNUM:
1461   case TargetOpcode::G_FMAXNUM:
1462   case TargetOpcode::G_FMINNUM_IEEE:
1463   case TargetOpcode::G_FMAXNUM_IEEE:
1464     return legalizeMinNumMaxNum(MI, MRI, B);
1465   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1466     return legalizeExtractVectorElt(MI, MRI, B);
1467   case TargetOpcode::G_INSERT_VECTOR_ELT:
1468     return legalizeInsertVectorElt(MI, MRI, B);
1469   case TargetOpcode::G_SHUFFLE_VECTOR:
1470     return legalizeShuffleVector(MI, MRI, B);
1471   case TargetOpcode::G_FSIN:
1472   case TargetOpcode::G_FCOS:
1473     return legalizeSinCos(MI, MRI, B);
1474   case TargetOpcode::G_GLOBAL_VALUE:
1475     return legalizeGlobalValue(MI, MRI, B);
1476   case TargetOpcode::G_LOAD:
1477     return legalizeLoad(MI, MRI, B, Observer);
1478   case TargetOpcode::G_FMAD:
1479     return legalizeFMad(MI, MRI, B);
1480   case TargetOpcode::G_FDIV:
1481     return legalizeFDIV(MI, MRI, B);
1482   case TargetOpcode::G_UDIV:
1483   case TargetOpcode::G_UREM:
1484     return legalizeUDIV_UREM(MI, MRI, B);
1485   case TargetOpcode::G_SDIV:
1486   case TargetOpcode::G_SREM:
1487     return legalizeSDIV_SREM(MI, MRI, B);
1488   case TargetOpcode::G_ATOMIC_CMPXCHG:
1489     return legalizeAtomicCmpXChg(MI, MRI, B);
1490   case TargetOpcode::G_FLOG:
1491     return legalizeFlog(MI, B, numbers::ln2f);
1492   case TargetOpcode::G_FLOG10:
1493     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1494   case TargetOpcode::G_FEXP:
1495     return legalizeFExp(MI, B);
1496   case TargetOpcode::G_FPOW:
1497     return legalizeFPow(MI, B);
1498   case TargetOpcode::G_FFLOOR:
1499     return legalizeFFloor(MI, MRI, B);
1500   case TargetOpcode::G_BUILD_VECTOR:
1501     return legalizeBuildVector(MI, MRI, B);
1502   default:
1503     return false;
1504   }
1505 
1506   llvm_unreachable("expected switch to return");
1507 }
1508 
1509 Register AMDGPULegalizerInfo::getSegmentAperture(
1510   unsigned AS,
1511   MachineRegisterInfo &MRI,
1512   MachineIRBuilder &B) const {
1513   MachineFunction &MF = B.getMF();
1514   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1515   const LLT S32 = LLT::scalar(32);
1516 
1517   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1518 
1519   if (ST.hasApertureRegs()) {
1520     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1521     // getreg.
1522     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1523         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1524         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1525     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1526         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1527         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1528     unsigned Encoding =
1529         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1530         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1531         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1532 
1533     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1534 
1535     B.buildInstr(AMDGPU::S_GETREG_B32)
1536       .addDef(GetReg)
1537       .addImm(Encoding);
1538     MRI.setType(GetReg, S32);
1539 
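    // The hwreg field holds only the upper bits of the 32-bit aperture base,
    // so shift it back into position (the field is WidthM1 + 1 bits wide).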
1540     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1541     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1542   }
1543 
1544   Register QueuePtr = MRI.createGenericVirtualRegister(
1545     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1546 
1547   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1548   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1549     return Register();
1550 
1551   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1552   // private_segment_aperture_base_hi.
1553   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1554 
1555   // TODO: can we be smarter about machine pointer info?
1556   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1557   MachineMemOperand *MMO = MF.getMachineMemOperand(
1558       PtrInfo,
1559       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1560           MachineMemOperand::MOInvariant,
1561       4, commonAlignment(Align(64), StructOffset));
1562 
1563   Register LoadAddr;
1564 
1565   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1566   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   MachineFunction &MF = B.getMF();
1573 
1574   B.setInstr(MI);
1575 
1576   const LLT S32 = LLT::scalar(32);
1577   Register Dst = MI.getOperand(0).getReg();
1578   Register Src = MI.getOperand(1).getReg();
1579 
1580   LLT DstTy = MRI.getType(Dst);
1581   LLT SrcTy = MRI.getType(Src);
1582   unsigned DestAS = DstTy.getAddressSpace();
1583   unsigned SrcAS = SrcTy.getAddressSpace();
1584 
1585   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1586   // vector element.
1587   assert(!DstTy.isVector());
1588 
1589   const AMDGPUTargetMachine &TM
1590     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1591 
1592   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1593   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1594     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1595     return true;
1596   }
1597 
1598   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1599     // Truncate.
1600     B.buildExtract(Dst, Src, 0);
1601     MI.eraseFromParent();
1602     return true;
1603   }
1604 
1605   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1606     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1607     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1608 
1609     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1610     // another pointer. Merge operands are required to be the same type, but
1611     // creating an extra ptrtoint would be kind of pointless.
1612     auto HighAddr = B.buildConstant(
1613       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1614     B.buildMerge(Dst, {Src, HighAddr});
1615     MI.eraseFromParent();
1616     return true;
1617   }
1618 
1619   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1620     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1621            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1622     unsigned NullVal = TM.getNullPointerValue(DestAS);
1623 
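    // A null flat pointer must map to the segment's null value, so compare the
    // source against flat null and select the segment null constant when it
    // matches.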
1624     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1625     auto FlatNull = B.buildConstant(SrcTy, 0);
1626 
1627     // Extract low 32-bits of the pointer.
1628     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1629 
1630     auto CmpRes =
1631         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1632     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1633 
1634     MI.eraseFromParent();
1635     return true;
1636   }
1637 
1638   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1639     return false;
1640 
1641   if (!ST.hasFlatAddressSpace())
1642     return false;
1643 
1644   auto SegmentNull =
1645       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1646   auto FlatNull =
1647       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1648 
1649   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1650   if (!ApertureReg.isValid())
1651     return false;
1652 
1653   auto CmpRes =
1654       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1655 
1656   // Coerce the type of the low half of the result so we can use merge_values.
1657   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1658 
1659   // TODO: Should we allow mismatched types but matching sizes in merges to
1660   // avoid the ptrtoint?
1661   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1662   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1663 
1664   MI.eraseFromParent();
1665   return true;
1666 }
1667 
1668 bool AMDGPULegalizerInfo::legalizeFrint(
1669   MachineInstr &MI, MachineRegisterInfo &MRI,
1670   MachineIRBuilder &B) const {
1671   B.setInstr(MI);
1672 
1673   Register Src = MI.getOperand(1).getReg();
1674   LLT Ty = MRI.getType(Src);
1675   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1676 
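  // C1 is 2^52: adding and then subtracting it (with the sign of the source)
  // rounds the value to an integer, since doubles with magnitude >= 2^52 have
  // no fractional bits. C2 is the largest double below 2^52, used to skip
  // inputs that are already too large to carry a fraction.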
1677   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1678   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1679 
1680   auto C1 = B.buildFConstant(Ty, C1Val);
1681   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1682 
1683   // TODO: Should this propagate fast-math-flags?
1684   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1685   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1686 
1687   auto C2 = B.buildFConstant(Ty, C2Val);
1688   auto Fabs = B.buildFAbs(Ty, Src);
1689 
1690   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1691   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1692   return true;
1693 }
1694 
1695 bool AMDGPULegalizerInfo::legalizeFceil(
1696   MachineInstr &MI, MachineRegisterInfo &MRI,
1697   MachineIRBuilder &B) const {
1698   B.setInstr(MI);
1699 
1700   const LLT S1 = LLT::scalar(1);
1701   const LLT S64 = LLT::scalar(64);
1702 
1703   Register Src = MI.getOperand(1).getReg();
1704   assert(MRI.getType(Src) == S64);
1705 
1706   // result = trunc(src)
1707   // if (src > 0.0 && src != result)
1708   //   result += 1.0
1709 
1710   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1711 
1712   const auto Zero = B.buildFConstant(S64, 0.0);
1713   const auto One = B.buildFConstant(S64, 1.0);
1714   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1715   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1716   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1717   auto Add = B.buildSelect(S64, And, One, Zero);
1718 
1719   // TODO: Should this propagate fast-math-flags?
1720   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1721   return true;
1722 }
1723 
1724 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1725                                               MachineIRBuilder &B) {
1726   const unsigned FractBits = 52;
1727   const unsigned ExpBits = 11;
1728   LLT S32 = LLT::scalar(32);
1729 
1730   auto Const0 = B.buildConstant(S32, FractBits - 32);
1731   auto Const1 = B.buildConstant(S32, ExpBits);
1732 
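  // The biased exponent field is 11 bits wide and starts at bit 20 of the high
  // word (bit 52 of the f64); subtract the bias of 1023 afterwards.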
1733   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1734     .addUse(Register(Hi))
1735     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1736 
1737   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1738 }
1739 
1740 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1741   MachineInstr &MI, MachineRegisterInfo &MRI,
1742   MachineIRBuilder &B) const {
1743   B.setInstr(MI);
1744 
1745   const LLT S1 = LLT::scalar(1);
1746   const LLT S32 = LLT::scalar(32);
1747   const LLT S64 = LLT::scalar(64);
1748 
1749   Register Src = MI.getOperand(1).getReg();
1750   assert(MRI.getType(Src) == S64);
1751 
1752   // TODO: Should this use extract since the low half is unused?
1753   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1754   Register Hi = Unmerge.getReg(1);
1755 
1756   // Extract the upper half, since this is where we will find the sign and
1757   // exponent.
1758   auto Exp = extractF64Exponent(Hi, B);
1759 
1760   const unsigned FractBits = 52;
1761 
1762   // Extract the sign bit.
1763   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1764   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1765 
1766   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1767 
1768   const auto Zero32 = B.buildConstant(S32, 0);
1769 
1770   // Extend back to 64-bits.
1771   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1772 
1773   auto Shr = B.buildAShr(S64, FractMask, Exp);
1774   auto Not = B.buildNot(S64, Shr);
1775   auto Tmp0 = B.buildAnd(S64, Src, Not);
1776   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1777 
1778   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1779   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1780 
1781   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1782   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1783   return true;
1784 }
1785 
1786 bool AMDGPULegalizerInfo::legalizeITOFP(
1787   MachineInstr &MI, MachineRegisterInfo &MRI,
1788   MachineIRBuilder &B, bool Signed) const {
1789   B.setInstr(MI);
1790 
1791   Register Dst = MI.getOperand(0).getReg();
1792   Register Src = MI.getOperand(1).getReg();
1793 
1794   const LLT S64 = LLT::scalar(64);
1795   const LLT S32 = LLT::scalar(32);
1796 
1797   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1798 
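  // Convert the two 32-bit halves separately and combine them as hi * 2^32 +
  // lo. Only the high half carries the sign; the low half is always unsigned.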
1799   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1800 
1801   auto CvtHi = Signed ?
1802     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1803     B.buildUITOFP(S64, Unmerge.getReg(1));
1804 
1805   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1806 
1807   auto ThirtyTwo = B.buildConstant(S32, 32);
1808   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1809     .addUse(CvtHi.getReg(0))
1810     .addUse(ThirtyTwo.getReg(0));
1811 
1812   // TODO: Should this propagate fast-math-flags?
1813   B.buildFAdd(Dst, LdExp, CvtLo);
1814   MI.eraseFromParent();
1815   return true;
1816 }
1817 
1818 // TODO: Copied from DAG implementation. Verify logic and document how this
1819 // actually works.
1820 bool AMDGPULegalizerInfo::legalizeFPTOI(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B, bool Signed) const {
1823   B.setInstr(MI);
1824 
1825   Register Dst = MI.getOperand(0).getReg();
1826   Register Src = MI.getOperand(1).getReg();
1827 
1828   const LLT S64 = LLT::scalar(64);
1829   const LLT S32 = LLT::scalar(32);
1830 
1831   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1832 
1833   unsigned Flags = MI.getFlags();
1834 
1835   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
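  // K0 = 2^-32 and K1 = -2^32: Hi = floor(trunc(x) * 2^-32) and
  // Lo = fma(Hi, -2^32, trunc(x)) give the high and low 32-bit halves.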
1836   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1837   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1838 
1839   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1840   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1841   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1842 
1843   auto Hi = Signed ?
1844     B.buildFPTOSI(S32, FloorMul) :
1845     B.buildFPTOUI(S32, FloorMul);
1846   auto Lo = B.buildFPTOUI(S32, Fma);
1847 
1848   B.buildMerge(Dst, { Lo, Hi });
1849   MI.eraseFromParent();
1850 
1851   return true;
1852 }
1853 
1854 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1855   MachineInstr &MI, MachineRegisterInfo &MRI,
1856   MachineIRBuilder &B) const {
1857   MachineFunction &MF = B.getMF();
1858   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1859 
1860   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1861                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1862 
1863   // With ieee_mode disabled, the instructions have the correct behavior
1864   // already for G_FMINNUM/G_FMAXNUM
1865   if (!MFI->getMode().IEEE)
1866     return !IsIEEEOp;
1867 
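  // With ieee_mode enabled, the hardware min/max already implements the
  // *_IEEE semantics (quieting signaling NaNs), so only the non-IEEE forms
  // need to be expanded.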
1868   if (IsIEEEOp)
1869     return true;
1870 
1871   MachineIRBuilder HelperBuilder(MI);
1872   GISelObserverWrapper DummyObserver;
1873   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1874   HelperBuilder.setInstr(MI);
1875   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1876 }
1877 
1878 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1879   MachineInstr &MI, MachineRegisterInfo &MRI,
1880   MachineIRBuilder &B) const {
1881   // TODO: Should move some of this into LegalizerHelper.
1882 
1883   // TODO: Promote dynamic indexing of s16 to s32
1884 
1885   // FIXME: Artifact combiner probably should have replaced the truncated
1886   // constant before this, so we shouldn't need
1887   // getConstantVRegValWithLookThrough.
1888   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1889     MI.getOperand(2).getReg(), MRI);
1890   if (!IdxVal) // Dynamic case will be selected to register indexing.
1891     return true;
1892 
1893   Register Dst = MI.getOperand(0).getReg();
1894   Register Vec = MI.getOperand(1).getReg();
1895 
1896   LLT VecTy = MRI.getType(Vec);
1897   LLT EltTy = VecTy.getElementType();
1898   assert(EltTy == MRI.getType(Dst));
1899 
1900   B.setInstr(MI);
1901 
1902   if (IdxVal->Value < VecTy.getNumElements())
1903     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1904   else
1905     B.buildUndef(Dst);
1906 
1907   MI.eraseFromParent();
1908   return true;
1909 }
1910 
1911 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1912   MachineInstr &MI, MachineRegisterInfo &MRI,
1913   MachineIRBuilder &B) const {
1914   // TODO: Should move some of this into LegalizerHelper.
1915 
1916   // TODO: Promote dynamic indexing of s16 to s32
1917 
1918   // FIXME: Artifact combiner probably should have replaced the truncated
1919   // constant before this, so we shouldn't need
1920   // getConstantVRegValWithLookThrough.
1921   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1922     MI.getOperand(3).getReg(), MRI);
1923   if (!IdxVal) // Dynamic case will be selected to register indexing.
1924     return true;
1925 
1926   Register Dst = MI.getOperand(0).getReg();
1927   Register Vec = MI.getOperand(1).getReg();
1928   Register Ins = MI.getOperand(2).getReg();
1929 
1930   LLT VecTy = MRI.getType(Vec);
1931   LLT EltTy = VecTy.getElementType();
1932   assert(EltTy == MRI.getType(Ins));
1933 
1934   B.setInstr(MI);
1935 
1936   if (IdxVal->Value < VecTy.getNumElements())
1937     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1938   else
1939     B.buildUndef(Dst);
1940 
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1946   MachineInstr &MI, MachineRegisterInfo &MRI,
1947   MachineIRBuilder &B) const {
1948   const LLT V2S16 = LLT::vector(2, 16);
1949 
1950   Register Dst = MI.getOperand(0).getReg();
1951   Register Src0 = MI.getOperand(1).getReg();
1952   LLT DstTy = MRI.getType(Dst);
1953   LLT SrcTy = MRI.getType(Src0);
1954 
1955   if (SrcTy == V2S16 && DstTy == V2S16 &&
1956       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1957     return true;
1958 
1959   MachineIRBuilder HelperBuilder(MI);
1960   GISelObserverWrapper DummyObserver;
1961   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1962   HelperBuilder.setInstr(MI);
1963   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1964 }
1965 
1966 bool AMDGPULegalizerInfo::legalizeSinCos(
1967   MachineInstr &MI, MachineRegisterInfo &MRI,
1968   MachineIRBuilder &B) const {
1969   B.setInstr(MI);
1970 
1971   Register DstReg = MI.getOperand(0).getReg();
1972   Register SrcReg = MI.getOperand(1).getReg();
1973   LLT Ty = MRI.getType(DstReg);
1974   unsigned Flags = MI.getFlags();
1975 
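  // The hardware sin/cos expect the argument pre-scaled by 1/(2*pi).
  // Subtargets with a reduced valid input range additionally wrap the scaled
  // argument into [0, 1) with the fract intrinsic.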
1976   Register TrigVal;
1977   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1978   if (ST.hasTrigReducedRange()) {
1979     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1980     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1981       .addUse(MulVal.getReg(0))
1982       .setMIFlags(Flags).getReg(0);
1983   } else
1984     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1985 
1986   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1987     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1988   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1989     .addUse(TrigVal)
1990     .setMIFlags(Flags);
1991   MI.eraseFromParent();
1992   return true;
1993 }
1994 
1995 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1996   Register DstReg, LLT PtrTy,
1997   MachineIRBuilder &B, const GlobalValue *GV,
1998   unsigned Offset, unsigned GAFlags) const {
1999   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2000   // to the following code sequence:
2001   //
2002   // For constant address space:
2003   //   s_getpc_b64 s[0:1]
2004   //   s_add_u32 s0, s0, $symbol
2005   //   s_addc_u32 s1, s1, 0
2006   //
2007   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2008   //   a fixup or relocation is emitted to replace $symbol with a literal
2009   //   constant, which is a pc-relative offset from the encoding of the $symbol
2010   //   operand to the global variable.
2011   //
2012   // For global address space:
2013   //   s_getpc_b64 s[0:1]
2014   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2015   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2016   //
2017   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2018   //   fixups or relocations are emitted to replace $symbol@*@lo and
2019   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2020   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2021   //   operand to the global variable.
2022   //
2023   // What we want here is an offset from the value returned by s_getpc
2024   // (which is the address of the s_add_u32 instruction) to the global
2025   // variable, but since the encoding of $symbol starts 4 bytes after the start
2026   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2027   // small. This requires us to add 4 to the global variable offset in order to
2028   // compute the correct address.
2029 
2030   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2031 
2032   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2033     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2034 
2035   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2036     .addDef(PCReg);
2037 
2038   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2039   if (GAFlags == SIInstrInfo::MO_NONE)
2040     MIB.addImm(0);
2041   else
2042     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2043 
2044   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2045 
2046   if (PtrTy.getSizeInBits() == 32)
2047     B.buildExtract(DstReg, PCReg, 0);
2048   return true;
2049 }
2050 
2051 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2052   MachineInstr &MI, MachineRegisterInfo &MRI,
2053   MachineIRBuilder &B) const {
2054   Register DstReg = MI.getOperand(0).getReg();
2055   LLT Ty = MRI.getType(DstReg);
2056   unsigned AS = Ty.getAddressSpace();
2057 
2058   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2059   MachineFunction &MF = B.getMF();
2060   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061   B.setInstr(MI);
2062 
2063   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2064     if (!MFI->isEntryFunction()) {
2065       const Function &Fn = MF.getFunction();
2066       DiagnosticInfoUnsupported BadLDSDecl(
2067         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2068         DS_Warning);
2069       Fn.getContext().diagnose(BadLDSDecl);
2070 
2071       // We currently don't have a way to correctly allocate LDS objects that
2072       // aren't directly associated with a kernel. We do force inlining of
2073       // functions that use local objects. However, if these dead functions are
2074       // not eliminated, we don't want a compile time error. Just emit a warning
2075       // and a trap, since there should be no callable path here.
2076       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2077       B.buildUndef(DstReg);
2078       MI.eraseFromParent();
2079       return true;
2080     }
2081 
2082     // TODO: We could emit code to handle the initialization somewhere.
2083     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2084       const SITargetLowering *TLI = ST.getTargetLowering();
2085       if (!TLI->shouldUseLDSConstAddress(GV)) {
2086         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2087         return true; // Leave in place.
2088       }
2089 
2090       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2091       MI.eraseFromParent();
2092       return true;
2093     }
2094 
2095     const Function &Fn = MF.getFunction();
2096     DiagnosticInfoUnsupported BadInit(
2097       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2098     Fn.getContext().diagnose(BadInit);
2099     return true;
2100   }
2101 
2102   const SITargetLowering *TLI = ST.getTargetLowering();
2103 
2104   if (TLI->shouldEmitFixup(GV)) {
2105     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2106     MI.eraseFromParent();
2107     return true;
2108   }
2109 
2110   if (TLI->shouldEmitPCReloc(GV)) {
2111     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2112     MI.eraseFromParent();
2113     return true;
2114   }
2115 
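  // Otherwise load the 64-bit address of the global from the GOT, which is
  // itself addressed pc-relatively.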
2116   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2117   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2118 
2119   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2120       MachinePointerInfo::getGOT(MF),
2121       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2122           MachineMemOperand::MOInvariant,
2123       8 /*Size*/, Align(8));
2124 
2125   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2126 
2127   if (Ty.getSizeInBits() == 32) {
2128     // Truncate if this is a 32-bit constant address.
2129     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2130     B.buildExtract(DstReg, Load, 0);
2131   } else
2132     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2133 
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeLoad(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2141   B.setInstr(MI);
2142   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2143   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2144   Observer.changingInstr(MI);
2145   MI.getOperand(1).setReg(Cast.getReg(0));
2146   Observer.changedInstr(MI);
2147   return true;
2148 }
2149 
2150 bool AMDGPULegalizerInfo::legalizeFMad(
2151   MachineInstr &MI, MachineRegisterInfo &MRI,
2152   MachineIRBuilder &B) const {
2153   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2154   assert(Ty.isScalar());
2155 
2156   MachineFunction &MF = B.getMF();
2157   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2158 
2159   // TODO: Always legal with future ftz flag.
2160   // FIXME: Do we need just output?
2161   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2162     return true;
2163   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2164     return true;
2165 
2166   MachineIRBuilder HelperBuilder(MI);
2167   GISelObserverWrapper DummyObserver;
2168   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2169   HelperBuilder.setInstr(MI);
2170   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2171 }
2172 
2173 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2174   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2175   Register DstReg = MI.getOperand(0).getReg();
2176   Register PtrReg = MI.getOperand(1).getReg();
2177   Register CmpVal = MI.getOperand(2).getReg();
2178   Register NewVal = MI.getOperand(3).getReg();
2179 
2180   assert(SITargetLowering::isFlatGlobalAddrSpace(
2181            MRI.getType(PtrReg).getAddressSpace()) &&
2182          "this should not have been custom lowered");
2183 
2184   LLT ValTy = MRI.getType(CmpVal);
2185   LLT VecTy = LLT::vector(2, ValTy);
2186 
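  // The target cmpxchg pseudo expects the new value and the compare value
  // packed together into a single vector operand.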
2187   B.setInstr(MI);
2188   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2189 
2190   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2191     .addDef(DstReg)
2192     .addUse(PtrReg)
2193     .addUse(PackedVal)
2194     .setMemRefs(MI.memoperands());
2195 
2196   MI.eraseFromParent();
2197   return true;
2198 }
2199 
2200 bool AMDGPULegalizerInfo::legalizeFlog(
2201   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2202   Register Dst = MI.getOperand(0).getReg();
2203   Register Src = MI.getOperand(1).getReg();
2204   LLT Ty = B.getMRI()->getType(Dst);
2205   unsigned Flags = MI.getFlags();
2206   B.setInstr(MI);
2207 
2208   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2209   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2210 
2211   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2212   MI.eraseFromParent();
2213   return true;
2214 }
2215 
2216 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2217                                        MachineIRBuilder &B) const {
2218   Register Dst = MI.getOperand(0).getReg();
2219   Register Src = MI.getOperand(1).getReg();
2220   unsigned Flags = MI.getFlags();
2221   LLT Ty = B.getMRI()->getType(Dst);
2222   B.setInstr(MI);
2223 
2224   auto K = B.buildFConstant(Ty, numbers::log2e);
2225   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2226   B.buildFExp2(Dst, Mul, Flags);
2227   MI.eraseFromParent();
2228   return true;
2229 }
2230 
2231 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2232                                        MachineIRBuilder &B) const {
2233   Register Dst = MI.getOperand(0).getReg();
2234   Register Src0 = MI.getOperand(1).getReg();
2235   Register Src1 = MI.getOperand(2).getReg();
2236   unsigned Flags = MI.getFlags();
2237   LLT Ty = B.getMRI()->getType(Dst);
2238   B.setInstr(MI);
2239   const LLT S16 = LLT::scalar(16);
2240   const LLT S32 = LLT::scalar(32);
2241 
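  // Expand pow(x, y) as exp2(y * log2(x)). fmul_legacy is used for the
  // multiply, presumably so that 0 * anything = 0, which is closer to pow's
  // expected behavior when log2 returns an infinity.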
2242   if (Ty == S32) {
2243     auto Log = B.buildFLog2(S32, Src0, Flags);
2244     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2245       .addUse(Log.getReg(0))
2246       .addUse(Src1)
2247       .setMIFlags(Flags);
2248     B.buildFExp2(Dst, Mul, Flags);
2249   } else if (Ty == S16) {
2250     // There's no f16 fmul_legacy, so we need to convert for it.
2251     auto Log = B.buildFLog2(S16, Src0, Flags);
2252     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2253     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2254     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2255       .addUse(Ext0.getReg(0))
2256       .addUse(Ext1.getReg(0))
2257       .setMIFlags(Flags);
2258 
2259     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2260   } else
2261     return false;
2262 
2263   MI.eraseFromParent();
2264   return true;
2265 }
2266 
2267 // Find a source register, ignoring any possible source modifiers.
2268 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2269   Register ModSrc = OrigSrc;
2270   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2271     ModSrc = SrcFNeg->getOperand(1).getReg();
2272     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2273       ModSrc = SrcFAbs->getOperand(1).getReg();
2274   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2275     ModSrc = SrcFAbs->getOperand(1).getReg();
2276   return ModSrc;
2277 }
2278 
2279 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2280                                          MachineRegisterInfo &MRI,
2281                                          MachineIRBuilder &B) const {
2282   B.setInstr(MI);
2283 
2284   const LLT S1 = LLT::scalar(1);
2285   const LLT S64 = LLT::scalar(64);
2286   Register Dst = MI.getOperand(0).getReg();
2287   Register OrigSrc = MI.getOperand(1).getReg();
2288   unsigned Flags = MI.getFlags();
2289   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2290          "this should not have been custom lowered");
2291 
2292   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2293   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2294   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2295   // V_FRACT bug is:
2296   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2297   //
2298   // Convert floor(x) to (x - fract(x))
2299 
2300   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2301     .addUse(OrigSrc)
2302     .setMIFlags(Flags);
2303 
2304   // Give source modifier matching some assistance before obscuring a foldable
2305   // pattern.
2306 
2307   // TODO: We can avoid the neg on the fract? The input sign to fract
2308   // shouldn't matter?
2309   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2310 
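  // 0x3fefffffffffffff is the largest double strictly less than 1.0, per the
  // V_FRACT workaround described above.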
2311   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2312 
2313   Register Min = MRI.createGenericVirtualRegister(S64);
2314 
2315   // We don't need to concern ourselves with the snan handling difference, so
2316   // use the one which will directly select.
2317   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2318   if (MFI->getMode().IEEE)
2319     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2320   else
2321     B.buildFMinNum(Min, Fract, Const, Flags);
2322 
2323   Register CorrectedFract = Min;
2324   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2325     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2326     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2327   }
2328 
2329   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2330   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2331 
2332   MI.eraseFromParent();
2333   return true;
2334 }
2335 
2336 // Turn an illegal packed v2s16 build vector into bit operations.
2337 // TODO: This should probably be a bitcast action in LegalizerHelper.
2338 bool AMDGPULegalizerInfo::legalizeBuildVector(
2339   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2340   Register Dst = MI.getOperand(0).getReg();
2341   const LLT S32 = LLT::scalar(32);
2342   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2343 
2344   Register Src0 = MI.getOperand(1).getReg();
2345   Register Src1 = MI.getOperand(2).getReg();
2346   assert(MRI.getType(Src0) == LLT::scalar(16));
2347 
2348   B.setInstr(MI);
2349   auto Merge = B.buildMerge(S32, {Src0, Src1});
2350   B.buildBitcast(Dst, Merge);
2351 
2352   MI.eraseFromParent();
2353   return true;
2354 }
2355 
2356 // Return the use branch instruction, otherwise null if the usage is invalid.
2357 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2358                                        MachineRegisterInfo &MRI,
2359                                        MachineInstr *&Br,
2360                                        MachineBasicBlock *&UncondBrTarget) {
2361   Register CondDef = MI.getOperand(0).getReg();
2362   if (!MRI.hasOneNonDBGUse(CondDef))
2363     return nullptr;
2364 
2365   MachineBasicBlock *Parent = MI.getParent();
2366   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2367   if (UseMI.getParent() != Parent ||
2368       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2369     return nullptr;
2370 
2371   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2372   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2373   if (Next == Parent->end()) {
2374     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2375     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2376       return nullptr;
2377     UncondBrTarget = &*NextMBB;
2378   } else {
2379     if (Next->getOpcode() != AMDGPU::G_BR)
2380       return nullptr;
2381     Br = &*Next;
2382     UncondBrTarget = Br->getOperand(0).getMBB();
2383   }
2384 
2385   return &UseMI;
2386 }
2387 
2388 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2389                                                MachineRegisterInfo &MRI,
2390                                                Register LiveIn,
2391                                                Register PhyReg) const {
2392   assert(PhyReg.isPhysical() && "Physical register expected");
2393 
2394   // Insert the live-in copy, if required, by defining the destination
2395   // virtual register.
2396   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2397   if (!MRI.getVRegDef(LiveIn)) {
2398     // FIXME: Should have scoped insert pt
2399     MachineBasicBlock &OrigInsBB = B.getMBB();
2400     auto OrigInsPt = B.getInsertPt();
2401 
2402     MachineBasicBlock &EntryMBB = B.getMF().front();
2403     EntryMBB.addLiveIn(PhyReg);
2404     B.setInsertPt(EntryMBB, EntryMBB.begin());
2405     B.buildCopy(LiveIn, PhyReg);
2406 
2407     B.setInsertPt(OrigInsBB, OrigInsPt);
2408   }
2409 
2410   return LiveIn;
2411 }
2412 
2413 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2414                                                 MachineRegisterInfo &MRI,
2415                                                 Register PhyReg, LLT Ty,
2416                                                 bool InsertLiveInCopy) const {
2417   assert(PhyReg.isPhysical() && "Physical register expected");
2418 
2419   // Get or create the virtual live-in register.
2420   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2421   if (!LiveIn) {
2422     LiveIn = MRI.createGenericVirtualRegister(Ty);
2423     MRI.addLiveIn(PhyReg, LiveIn);
2424   }
2425 
2426   // When the actual copy required is from a virtual register to a physical
2427   // register (to be inserted later), the live-in copy from the physical
2428   // register to a virtual register is not required.
2429   if (!InsertLiveInCopy)
2430     return LiveIn;
2431 
2432   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2433 }
2434 
2435 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2436     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2437   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2438   const ArgDescriptor *Arg;
2439   const TargetRegisterClass *RC;
2440   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2441   if (!Arg) {
2442     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2443     return nullptr;
2444   }
2445   return Arg;
2446 }
2447 
2448 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2449                                          const ArgDescriptor *Arg) const {
2450   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2451     return false; // TODO: Handle these
2452 
2453   Register SrcReg = Arg->getRegister();
2454   assert(SrcReg.isPhysical() && "Physical register expected");
2455   assert(DstReg.isVirtual() && "Virtual register expected");
2456 
2457   MachineRegisterInfo &MRI = *B.getMRI();
2458 
2459   LLT Ty = MRI.getType(DstReg);
2460   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2461 
2462   if (Arg->isMasked()) {
2463     // TODO: Should we try to emit this once in the entry block?
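    // A masked argument shares its physical register with other values (for
    // example, packed work-item IDs), so shift and mask out the requested
    // piece.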
2464     const LLT S32 = LLT::scalar(32);
2465     const unsigned Mask = Arg->getMask();
2466     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2467 
2468     Register AndMaskSrc = LiveIn;
2469 
2470     if (Shift != 0) {
2471       auto ShiftAmt = B.buildConstant(S32, Shift);
2472       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2473     }
2474 
2475     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2476   } else {
2477     B.buildCopy(DstReg, LiveIn);
2478   }
2479 
2480   return true;
2481 }
2482 
2483 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2484     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2485     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2486   B.setInstr(MI);
2487 
2488   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2489   if (!Arg)
2490     return false;
2491 
2492   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2493     return false;
2494 
2495   MI.eraseFromParent();
2496   return true;
2497 }
2498 
2499 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2500                                        MachineRegisterInfo &MRI,
2501                                        MachineIRBuilder &B) const {
2502   B.setInstr(MI);
2503   Register Dst = MI.getOperand(0).getReg();
2504   LLT DstTy = MRI.getType(Dst);
2505   LLT S16 = LLT::scalar(16);
2506   LLT S32 = LLT::scalar(32);
2507   LLT S64 = LLT::scalar(64);
2508 
2509   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2510     return true;
2511 
2512   if (DstTy == S16)
2513     return legalizeFDIV16(MI, MRI, B);
2514   if (DstTy == S32)
2515     return legalizeFDIV32(MI, MRI, B);
2516   if (DstTy == S64)
2517     return legalizeFDIV64(MI, MRI, B);
2518 
2519   return false;
2520 }
2521 
2522 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2523   const LLT S32 = LLT::scalar(32);
2524 
2525   auto Cvt0 = B.buildUITOFP(S32, Src);
2526   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
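  // 0x4f800000 is 2^32 as a float; scaling the ~1/Den reciprocal by it makes
  // the truncated result approximate 2^32 / Den.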
2527   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2528   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2529   return B.buildFPTOUI(S32, Mul).getReg(0);
2530 }
2531 
2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2533                                                   Register DstReg,
2534                                                   Register Num,
2535                                                   Register Den,
2536                                                   bool IsRem) const {
2537   const LLT S1 = LLT::scalar(1);
2538   const LLT S32 = LLT::scalar(32);
2539 
2540   // RCP =  URECIP(Den) = 2^32 / Den + e
2541   // e is rounding error.
2542   auto RCP = buildDivRCP(B, Den);
2543 
2544   // RCP_LO = mul(RCP, Den)
2545   auto RCP_LO = B.buildMul(S32, RCP, Den);
2546 
2547   // RCP_HI = mulhu(RCP, Den)
2548   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2549 
2550   // NEG_RCP_LO = -RCP_LO
2551   auto Zero = B.buildConstant(S32, 0);
2552   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2553 
2554   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2555   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2556   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2557 
2558   // Calculate the rounding error from the URECIP instruction
2559   // E = mulhu(ABS_RCP_LO, RCP)
2560   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2561 
2562   // RCP_A_E = RCP + E
2563   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2564 
2565   // RCP_S_E = RCP - E
2566   auto RCP_S_E = B.buildSub(S32, RCP, E);
2567 
2568   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2569   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2570 
2571   // Quotient = mulhu(Tmp0, Num)
2572   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2573 
2574   // Num_S_Remainder = Quotient * Den
2575   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2576 
2577   // Remainder = Num - Num_S_Remainder
2578   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2579 
2580   // Remainder_GE_Den = Remainder >= Den
2581   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2582 
2583   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2584   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2585                                        Num, Num_S_Remainder);
2586 
2587   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2588   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2589 
2590   // Calculate Division result:
2591 
2592   // Quotient_A_One = Quotient + 1
2593   auto One = B.buildConstant(S32, 1);
2594   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2595 
2596   // Quotient_S_One = Quotient - 1
2597   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2598 
2599   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2600   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2601 
2602   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2603   if (IsRem) {
2604     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2605 
2606     // Calculate Rem result:
2607     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2608 
2609     // Remainder_A_Den = Remainder + Den
2610     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2611 
2612     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2613     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2614 
2615     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2616     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2617   } else {
2618     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2619   }
2620 }
2621 
2622 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2623                                               MachineRegisterInfo &MRI,
2624                                               MachineIRBuilder &B) const {
2625   B.setInstr(MI);
2626   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2627   Register DstReg = MI.getOperand(0).getReg();
2628   Register Num = MI.getOperand(1).getReg();
2629   Register Den = MI.getOperand(2).getReg();
2630   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2631   MI.eraseFromParent();
2632   return true;
2633 }
2634 
2635 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2636 //
2637 // Return lo, hi of result
2638 //
2639 // %cvt.lo = G_UITOFP Val.lo
2640 // %cvt.hi = G_UITOFP Val.hi
2641 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2642 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2643 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2644 // %mul2 = G_FMUL %mul1, 2**(-32)
2645 // %trunc = G_INTRINSIC_TRUNC %mul2
2646 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2647 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
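// The float constants are 2^32 (0x4f800000), just under 2^64 (0x5f7ffffc),
// 2^-32 (0x2f800000) and -2^32 (0xcf800000).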
2648 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2649                                                        Register Val) {
2650   const LLT S32 = LLT::scalar(32);
2651   auto Unmerge = B.buildUnmerge(S32, Val);
2652 
2653   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2654   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2655 
2656   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2657                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2658 
2659   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2660   auto Mul1 =
2661       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2662 
2663   // 2**(-32)
2664   auto Mul2 =
2665       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2666   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2667 
2668   // -(2**32)
2669   auto Mad2 = B.buildFMAD(S32, Trunc,
2670                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2671 
2672   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2673   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2674 
2675   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2676 }
2677 
2678 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2679                                               MachineRegisterInfo &MRI,
2680                                               MachineIRBuilder &B) const {
2681   B.setInstr(MI);
2682 
2683   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2684   const LLT S32 = LLT::scalar(32);
2685   const LLT S64 = LLT::scalar(64);
2686   const LLT S1 = LLT::scalar(1);
2687   Register Numer = MI.getOperand(1).getReg();
2688   Register Denom = MI.getOperand(2).getReg();
2689   Register RcpLo, RcpHi;
2690 
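  // Refine the 2^64 / Denom estimate from emitReciprocalU64 with two
  // multiply/add rounds done in 32-bit pieces, take the high half of the
  // product with the numerator as the quotient estimate, then conditionally
  // correct the quotient (or remainder) up to two times.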
2691   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2692 
2693   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2694 
2695   auto Zero64 = B.buildConstant(S64, 0);
2696   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2697 
2698   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2699   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2700 
2701   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2702   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2703   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2704 
2705   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2706   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2707   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2708   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2709 
2710   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2711   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2712   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2713   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2714   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2715 
2716   auto Zero32 = B.buildConstant(S32, 0);
2717   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2718   auto Add2_HiC =
2719       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2720   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2721   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2722 
2723   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2724   Register NumerLo = UnmergeNumer.getReg(0);
2725   Register NumerHi = UnmergeNumer.getReg(1);
2726 
2727   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2728   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2729   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2730   Register Mul3_Lo = UnmergeMul3.getReg(0);
2731   Register Mul3_Hi = UnmergeMul3.getReg(1);
2732   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2733   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2734   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2735   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2736 
2737   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2738   Register DenomLo = UnmergeDenom.getReg(0);
2739   Register DenomHi = UnmergeDenom.getReg(1);
2740 
2741   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2742   auto C1 = B.buildSExt(S32, CmpHi);
2743 
2744   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2745   auto C2 = B.buildSExt(S32, CmpLo);
2746 
2747   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2748   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2749 
2750   // TODO: Here and below, portions of the code could be enclosed in if/endif.
2751   // Currently the control flow is unconditional and we have 4 selects after
2752   // the potential endif to substitute for PHIs.
2753 
2754   // if C3 != 0 ...
2755   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2756   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2757   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2758   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2759 
2760   auto One64 = B.buildConstant(S64, 1);
2761   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2762 
2763   auto C4 =
2764       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2765   auto C5 =
2766       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2767   auto C6 = B.buildSelect(
2768       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2769 
2770   // if (C6 != 0)
2771   auto Add4 = B.buildAdd(S64, Add3, One64);
2772   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2773 
2774   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2775   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2776   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2777 
2778   // endif C6
2779   // endif C3
2780 
2781   if (IsDiv) {
2782     auto Sel1 = B.buildSelect(
2783         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2784     B.buildSelect(MI.getOperand(0),
2785                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2786   } else {
2787     auto Sel2 = B.buildSelect(
2788         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2789     B.buildSelect(MI.getOperand(0),
2790                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2791   }
2792 
2793   MI.eraseFromParent();
2794   return true;
2795 }
2796 
2797 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2798                                             MachineRegisterInfo &MRI,
2799                                             MachineIRBuilder &B) const {
2800   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2801   if (Ty == LLT::scalar(32))
2802     return legalizeUDIV_UREM32(MI, MRI, B);
2803   if (Ty == LLT::scalar(64))
2804     return legalizeUDIV_UREM64(MI, MRI, B);
2805   return false;
2806 }
2807 
2808 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2809                                               MachineRegisterInfo &MRI,
2810                                               MachineIRBuilder &B) const {
2811   B.setInstr(MI);
2812   const LLT S32 = LLT::scalar(32);
2813 
2814   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2815   Register DstReg = MI.getOperand(0).getReg();
2816   Register LHS = MI.getOperand(1).getReg();
2817   Register RHS = MI.getOperand(2).getReg();
2818 
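  // Implement signed div/rem on top of the unsigned routine: (x + sign) ^ sign
  // computes |x|, and the result sign is restored with (r ^ s) - s, where s is
  // the numerator's sign for rem and the xor of the operand signs for div.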
2819   auto ThirtyOne = B.buildConstant(S32, 31);
2820   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2821   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2822 
2823   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2824   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2825 
2826   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2827   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2828 
2829   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2830   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2831 
2832   if (IsRem) {
2833     auto RSign = LHSign; // Remainder sign is the same as LHS
2834     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2835     B.buildSub(DstReg, UDivRem, RSign);
2836   } else {
2837     auto DSign = B.buildXor(S32, LHSign, RHSign);
2838     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2839     B.buildSub(DstReg, UDivRem, DSign);
2840   }
2841 
2842   MI.eraseFromParent();
2843   return true;
2844 }
2845 
2846 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2847                                             MachineRegisterInfo &MRI,
2848                                             MachineIRBuilder &B) const {
2849   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2850     return legalizeSDIV_SREM32(MI, MRI, B);
2851   return false;
2852 }
2853 
2854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2855                                                  MachineRegisterInfo &MRI,
2856                                                  MachineIRBuilder &B) const {
2857   Register Res = MI.getOperand(0).getReg();
2858   Register LHS = MI.getOperand(1).getReg();
2859   Register RHS = MI.getOperand(2).getReg();
2860 
2861   uint16_t Flags = MI.getFlags();
2862 
2863   LLT ResTy = MRI.getType(Res);
2864   LLT S32 = LLT::scalar(32);
2865   LLT S64 = LLT::scalar(64);
2866 
2867   const MachineFunction &MF = B.getMF();
2868   bool Unsafe =
2869     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2870 
2871   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2872     return false;
2873 
2874   if (!Unsafe && ResTy == S32 &&
2875       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2876     return false;
2877 
2878   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2879     // 1 / x -> RCP(x)
2880     if (CLHS->isExactlyValue(1.0)) {
2881       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2882         .addUse(RHS)
2883         .setMIFlags(Flags);
2884 
2885       MI.eraseFromParent();
2886       return true;
2887     }
2888 
2889     // -1 / x -> RCP( FNEG(x) )
2890     if (CLHS->isExactlyValue(-1.0)) {
2891       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2892       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2893         .addUse(FNeg.getReg(0))
2894         .setMIFlags(Flags);
2895 
2896       MI.eraseFromParent();
2897       return true;
2898     }
2899   }
2900 
2901   // x / y -> x * (1.0 / y)
2902   if (Unsafe) {
2903     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2904       .addUse(RHS)
2905       .setMIFlags(Flags);
2906     B.buildFMul(Res, LHS, RCP, Flags);
2907 
2908     MI.eraseFromParent();
2909     return true;
2910   }
2911 
2912   return false;
2913 }
2914 
2915 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2916                                          MachineRegisterInfo &MRI,
2917                                          MachineIRBuilder &B) const {
2918   B.setInstr(MI);
2919   Register Res = MI.getOperand(0).getReg();
2920   Register LHS = MI.getOperand(1).getReg();
2921   Register RHS = MI.getOperand(2).getReg();
2922 
2923   uint16_t Flags = MI.getFlags();
2924 
2925   LLT S16 = LLT::scalar(16);
2926   LLT S32 = LLT::scalar(32);
2927 
2928   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2929   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2930 
2931   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2932     .addUse(RHSExt.getReg(0))
2933     .setMIFlags(Flags);
2934 
2935   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2936   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2937 
2938   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2939     .addUse(RDst.getReg(0))
2940     .addUse(RHS)
2941     .addUse(LHS)
2942     .setMIFlags(Flags);
2943 
2944   MI.eraseFromParent();
2945   return true;
2946 }
2947 
2948 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2949 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2950 static void toggleSPDenormMode(bool Enable,
2951                                MachineIRBuilder &B,
2952                                const GCNSubtarget &ST,
2953                                AMDGPU::SIModeRegisterDefaults Mode) {
2954   // Set SP denorm mode to this value.
2955   unsigned SPDenormMode =
2956     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2957 
2958   if (ST.hasDenormModeInst()) {
2959     // Preserve the default FP64/FP16 denorm mode while updating FP32 mode.
2960     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2961 
2962     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2963     B.buildInstr(AMDGPU::S_DENORM_MODE)
2964       .addImm(NewDenormModeValue);
2965 
2966   } else {
2967     // Select FP32 bit field in mode register.
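    // This encodes hwreg(HW_REG_MODE, 4, 2): offset 4, width 2 (WIDTH_M1 = 1),
    // i.e. the two FP32 denorm control bits of the MODE register.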
2968     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2969                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2970                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2971 
2972     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2973       .addImm(SPDenormMode)
2974       .addImm(SPDenormModeBitField);
2975   }
2976 }
2977 
2978 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2979                                          MachineRegisterInfo &MRI,
2980                                          MachineIRBuilder &B) const {
2981   B.setInstr(MI);
2982   Register Res = MI.getOperand(0).getReg();
2983   Register LHS = MI.getOperand(1).getReg();
2984   Register RHS = MI.getOperand(2).getReg();
2985   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2986   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2987 
2988   uint16_t Flags = MI.getFlags();
2989 
2990   LLT S32 = LLT::scalar(32);
2991   LLT S1 = LLT::scalar(1);
2992 
2993   auto One = B.buildFConstant(S32, 1.0f);
2994 
2995   auto DenominatorScaled =
2996     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2997       .addUse(LHS)
2998       .addUse(RHS)
2999       .addImm(0)
3000       .setMIFlags(Flags);
3001   auto NumeratorScaled =
3002     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3003       .addUse(LHS)
3004       .addUse(RHS)
3005       .addImm(1)
3006       .setMIFlags(Flags);
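  // The trailing immediate selects which operand div_scale scales: 0 scales
  // the denominator, 1 scales the numerator.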
3007 
3008   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3009     .addUse(DenominatorScaled.getReg(0))
3010     .setMIFlags(Flags);
3011   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3012 
3013   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3014   // aren't modeled as reading it.
3015   if (!Mode.allFP32Denormals())
3016     toggleSPDenormMode(true, B, ST, Mode);
3017 
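  // One Newton-Raphson step refines the reciprocal estimate (Fma0/Fma1); the
  // scaled quotient (Mul) is then corrected once (Fma2/Fma3) and its final
  // residual (Fma4) is passed to div_fmas below.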
3018   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3019   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3020   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3021   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3022   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3023   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3024 
3025   if (!Mode.allFP32Denormals())
3026     toggleSPDenormMode(false, B, ST, Mode);
3027 
3028   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3029     .addUse(Fma4.getReg(0))
3030     .addUse(Fma1.getReg(0))
3031     .addUse(Fma3.getReg(0))
3032     .addUse(NumeratorScaled.getReg(1))
3033     .setMIFlags(Flags);
3034 
3035   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3036     .addUse(Fmas.getReg(0))
3037     .addUse(RHS)
3038     .addUse(LHS)
3039     .setMIFlags(Flags);
3040 
3041   MI.eraseFromParent();
3042   return true;
3043 }
3044 
3045 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3046                                          MachineRegisterInfo &MRI,
3047                                          MachineIRBuilder &B) const {
3048   B.setInstr(MI);
3049   Register Res = MI.getOperand(0).getReg();
3050   Register LHS = MI.getOperand(1).getReg();
3051   Register RHS = MI.getOperand(2).getReg();
3052 
3053   uint16_t Flags = MI.getFlags();
3054 
3055   LLT S64 = LLT::scalar(64);
3056   LLT S1 = LLT::scalar(1);
3057 
3058   auto One = B.buildFConstant(S64, 1.0);
3059 
3060   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3061     .addUse(LHS)
3062     .addUse(RHS)
3063     .addImm(0)
3064     .setMIFlags(Flags);
3065 
3066   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3067 
3068   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3069     .addUse(DivScale0.getReg(0))
3070     .setMIFlags(Flags);
3071 
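  // Fma0-Fma3 iteratively refine the reciprocal estimate; Mul is the scaled
  // quotient and Fma4 its residual, both passed to div_fmas below.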
3072   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3073   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3074   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3075 
3076   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3077     .addUse(LHS)
3078     .addUse(RHS)
3079     .addImm(1)
3080     .setMIFlags(Flags);
3081 
3082   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3083   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3084   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3085 
3086   Register Scale;
3087   if (!ST.hasUsableDivScaleConditionOutput()) {
3088     // Workaround a hardware bug on SI where the condition output from div_scale
3089     // is not usable.
3090 
3091     LLT S32 = LLT::scalar(32);
3092 
3093     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3094     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3095     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3096     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3097 
3098     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3099                               Scale1Unmerge.getReg(1));
3100     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3101                               Scale0Unmerge.getReg(1));
3102     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3103   } else {
3104     Scale = DivScale1.getReg(1);
3105   }
3106 
3107   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3108     .addUse(Fma4.getReg(0))
3109     .addUse(Fma3.getReg(0))
3110     .addUse(Mul.getReg(0))
3111     .addUse(Scale)
3112     .setMIFlags(Flags);
3113 
3114   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3115     .addUse(Fmas.getReg(0))
3116     .addUse(RHS)
3117     .addUse(LHS)
3118     .setMIFlags(Flags);
3119 
3120   MI.eraseFromParent();
3121   return true;
3122 }
3123 
3124 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3125                                                  MachineRegisterInfo &MRI,
3126                                                  MachineIRBuilder &B) const {
3127   B.setInstr(MI);
3128   Register Res = MI.getOperand(0).getReg();
3129   Register LHS = MI.getOperand(2).getReg();
3130   Register RHS = MI.getOperand(3).getReg();
3131   uint16_t Flags = MI.getFlags();
3132 
3133   LLT S32 = LLT::scalar(32);
3134   LLT S1 = LLT::scalar(1);
3135 
3136   auto Abs = B.buildFAbs(S32, RHS, Flags);
3137   const APFloat C0Val(1.0f);
3138 
3139   auto C0 = B.buildConstant(S32, 0x6f800000);
3140   auto C1 = B.buildConstant(S32, 0x2f800000);
3141   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
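  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| is very large,
  // pre-scale it by 2^-32 so the reciprocal does not flush to zero, then undo
  // the scaling on the final product.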
3142 
3143   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3144   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3145 
3146   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3147 
3148   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3149     .addUse(Mul0.getReg(0))
3150     .setMIFlags(Flags);
3151 
3152   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3153 
3154   B.buildFMul(Res, Sel, Mul1, Flags);
3155 
3156   MI.eraseFromParent();
3157   return true;
3158 }
3159 
3160 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3161                                                  MachineRegisterInfo &MRI,
3162                                                  MachineIRBuilder &B) const {
3163   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3164   if (!MFI->isEntryFunction()) {
3165     return legalizePreloadedArgIntrin(MI, MRI, B,
3166                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3167   }
3168 
3169   B.setInstr(MI);
3170 
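  // In an entry function the implicit arguments live at a fixed offset past
  // the explicit kernel arguments, so derive the pointer from the kernarg
  // segment pointer.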
3171   uint64_t Offset =
3172     ST.getTargetLowering()->getImplicitParameterOffset(
3173       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3174   Register DstReg = MI.getOperand(0).getReg();
3175   LLT DstTy = MRI.getType(DstReg);
3176   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3177 
3178   const ArgDescriptor *Arg;
3179   const TargetRegisterClass *RC;
3180   std::tie(Arg, RC)
3181     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3182   if (!Arg)
3183     return false;
3184 
3185   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3186   if (!loadInputValue(KernargPtrReg, B, Arg))
3187     return false;
3188 
3189   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3190   MI.eraseFromParent();
3191   return true;
3192 }
3193 
3194 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3195                                               MachineRegisterInfo &MRI,
3196                                               MachineIRBuilder &B,
3197                                               unsigned AddrSpace) const {
3198   B.setInstr(MI);
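  // A flat pointer lies in the queried segment iff the high 32 bits of the
  // address equal that segment's aperture base.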
3199   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3200   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3201   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3202   MI.eraseFromParent();
3203   return true;
3204 }
3205 
3206 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3207 // offset (the offset that is included in bounds checking and swizzling, to be
3208 // split between the instruction's voffset and immoffset fields) and soffset
3209 // (the offset that is excluded from bounds checking and swizzling, to go in
3210 // the instruction's soffset field).  This function takes the first kind of
3211 // offset and figures out how to split it between voffset and immoffset.
3212 std::tuple<Register, unsigned, unsigned>
3213 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3214                                         Register OrigOffset) const {
3215   const unsigned MaxImm = 4095;
3216   Register BaseReg;
3217   unsigned TotalConstOffset;
3218   MachineInstr *OffsetDef;
3219   const LLT S32 = LLT::scalar(32);
3220 
3221   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3222     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3223 
3224   unsigned ImmOffset = TotalConstOffset;
3225 
3226   // If the immediate value is too big for the immoffset field, put the value
3227   // and -4096 into the immoffset field so that the value that is copied/added
3228   // for the voffset field is a multiple of 4096, and it stands more chance
3229   // of being CSEd with the copy/add for another similar load/store.
3230   // However, do not do that rounding down to a multiple of 4096 if that is a
3231   // negative number, as it appears to be illegal to have a negative offset
3232   // in the vgpr, even if adding the immediate offset makes it positive.
3233   unsigned Overflow = ImmOffset & ~MaxImm;
3234   ImmOffset -= Overflow;
3235   if ((int32_t)Overflow < 0) {
3236     Overflow += ImmOffset;
3237     ImmOffset = 0;
3238   }
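  // For example, a constant offset of 4660 (0x1234) splits into
  // ImmOffset = 0x234, with Overflow = 0x1000 added to the voffset below.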
3239 
3240   if (Overflow != 0) {
3241     if (!BaseReg) {
3242       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3243     } else {
3244       auto OverflowVal = B.buildConstant(S32, Overflow);
3245       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3246     }
3247   }
3248 
3249   if (!BaseReg)
3250     BaseReg = B.buildConstant(S32, 0).getReg(0);
3251 
3252   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3253 }
3254 
3255 /// Handle register layout difference for f16 images for some subtargets.
3256 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3257                                              MachineRegisterInfo &MRI,
3258                                              Register Reg) const {
3259   if (!ST.hasUnpackedD16VMem())
3260     return Reg;
3261 
3262   const LLT S16 = LLT::scalar(16);
3263   const LLT S32 = LLT::scalar(32);
3264   LLT StoreVT = MRI.getType(Reg);
3265   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3266 
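  // On unpacked subtargets each 16-bit element occupies the low half of a
  // 32-bit register, so any-extend the elements and rebuild the value as a
  // vector of s32.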
3267   auto Unmerge = B.buildUnmerge(S16, Reg);
3268 
3269   SmallVector<Register, 4> WideRegs;
3270   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3271     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3272 
3273   int NumElts = StoreVT.getNumElements();
3274 
3275   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3276 }
3277 
3278 Register AMDGPULegalizerInfo::fixStoreSourceType(
3279   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3280   MachineRegisterInfo *MRI = B.getMRI();
3281   LLT Ty = MRI->getType(VData);
3282 
3283   const LLT S16 = LLT::scalar(16);
3284 
3285   // Fix up illegal register types for i8 and i16 stores.
3286   if (Ty == LLT::scalar(8) || Ty == S16) {
3287     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3288     return AnyExt;
3289   }
3290 
3291   if (Ty.isVector()) {
3292     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3293       if (IsFormat)
3294         return handleD16VData(B, *MRI, VData);
3295     }
3296   }
3297 
3298   return VData;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3302                                               MachineRegisterInfo &MRI,
3303                                               MachineIRBuilder &B,
3304                                               bool IsTyped,
3305                                               bool IsFormat) const {
3306   B.setInstr(MI);
3307 
3308   Register VData = MI.getOperand(1).getReg();
3309   LLT Ty = MRI.getType(VData);
3310   LLT EltTy = Ty.getScalarType();
3311   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3312   const LLT S32 = LLT::scalar(32);
3313 
3314   VData = fixStoreSourceType(B, VData, IsFormat);
3315   Register RSrc = MI.getOperand(2).getReg();
3316 
3317   MachineMemOperand *MMO = *MI.memoperands_begin();
3318   const int MemSize = MMO->getSize();
3319 
3320   unsigned ImmOffset;
3321   unsigned TotalOffset;
3322 
3323   // The typed intrinsics add an immediate after the registers.
3324   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3325 
3326   // The struct intrinsic variants add one additional operand over raw.
3327   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3328   Register VIndex;
3329   int OpOffset = 0;
3330   if (HasVIndex) {
3331     VIndex = MI.getOperand(3).getReg();
3332     OpOffset = 1;
3333   }
3334 
3335   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3336   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3337 
3338   unsigned Format = 0;
3339   if (IsTyped) {
3340     Format = MI.getOperand(5 + OpOffset).getImm();
3341     ++OpOffset;
3342   }
3343 
3344   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3345 
3346   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3347   if (TotalOffset != 0)
3348     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3349 
3350   unsigned Opc;
3351   if (IsTyped) {
3352     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3353                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3354   } else if (IsFormat) {
3355     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3356                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3357   } else {
3358     switch (MemSize) {
3359     case 1:
3360       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3361       break;
3362     case 2:
3363       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3364       break;
3365     default:
3366       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3367       break;
3368     }
3369   }
3370 
3371   if (!VIndex)
3372     VIndex = B.buildConstant(S32, 0).getReg(0);
3373 
3374   auto MIB = B.buildInstr(Opc)
3375     .addUse(VData)              // vdata
3376     .addUse(RSrc)               // rsrc
3377     .addUse(VIndex)             // vindex
3378     .addUse(VOffset)            // voffset
3379     .addUse(SOffset)            // soffset
3380     .addImm(ImmOffset);         // offset(imm)
3381 
3382   if (IsTyped)
3383     MIB.addImm(Format);
3384 
3385   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3386      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3387      .addMemOperand(MMO);
3388 
3389   MI.eraseFromParent();
3390   return true;
3391 }
3392 
3393 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3394                                              MachineRegisterInfo &MRI,
3395                                              MachineIRBuilder &B,
3396                                              bool IsFormat,
3397                                              bool IsTyped) const {
3398   B.setInstr(MI);
3399 
3400   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3401   MachineMemOperand *MMO = *MI.memoperands_begin();
3402   const int MemSize = MMO->getSize();
3403   const LLT S32 = LLT::scalar(32);
3404 
3405   Register Dst = MI.getOperand(0).getReg();
3406   Register RSrc = MI.getOperand(2).getReg();
3407 
3408   // The typed intrinsics add an immediate after the registers.
3409   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3410 
3411   // The struct intrinsic variants add one additional operand over raw.
3412   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3413   Register VIndex;
3414   int OpOffset = 0;
3415   if (HasVIndex) {
3416     VIndex = MI.getOperand(3).getReg();
3417     OpOffset = 1;
3418   }
3419 
3420   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3421   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3422 
3423   unsigned Format = 0;
3424   if (IsTyped) {
3425     Format = MI.getOperand(5 + OpOffset).getImm();
3426     ++OpOffset;
3427   }
3428 
3429   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3430   unsigned ImmOffset;
3431   unsigned TotalOffset;
3432 
3433   LLT Ty = MRI.getType(Dst);
3434   LLT EltTy = Ty.getScalarType();
3435   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3436   const bool Unpacked = ST.hasUnpackedD16VMem();
3437 
3438   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3439   if (TotalOffset != 0)
3440     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3441 
3442   unsigned Opc;
3443 
3444   if (IsTyped) {
3445     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3446                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3447   } else if (IsFormat) {
3448     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3449                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3450   } else {
3451     switch (MemSize) {
3452     case 1:
3453       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3454       break;
3455     case 2:
3456       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3457       break;
3458     default:
3459       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3460       break;
3461     }
3462   }
3463 
3464   Register LoadDstReg;
3465 
3466   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3467   LLT UnpackedTy = Ty.changeElementSize(32);
3468 
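  // Sub-dword loads and scalar d16 loads are widened to 32 bits and truncated
  // back afterwards; packed d16 vectors on unpacked subtargets are loaded as
  // <N x s32> and repacked.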
3469   if (IsExtLoad)
3470     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3471   else if (Unpacked && IsD16 && Ty.isVector())
3472     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3473   else
3474     LoadDstReg = Dst;
3475 
3476   if (!VIndex)
3477     VIndex = B.buildConstant(S32, 0).getReg(0);
3478 
3479   auto MIB = B.buildInstr(Opc)
3480     .addDef(LoadDstReg)         // vdata
3481     .addUse(RSrc)               // rsrc
3482     .addUse(VIndex)             // vindex
3483     .addUse(VOffset)            // voffset
3484     .addUse(SOffset)            // soffset
3485     .addImm(ImmOffset);         // offset(imm)
3486 
3487   if (IsTyped)
3488     MIB.addImm(Format);
3489 
3490   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3491      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3492      .addMemOperand(MMO);
3493 
3494   if (LoadDstReg != Dst) {
3495     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3496 
3497     // The result was widened for an extending load; truncate it back.
3498     if (IsExtLoad)
3499       B.buildTrunc(Dst, LoadDstReg);
3500     else {
3501       // Repack to original 16-bit vector result
3502       // FIXME: G_TRUNC should work, but legalization currently fails
3503       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3504       SmallVector<Register, 4> Repack;
3505       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3506         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3507       B.buildMerge(Dst, Repack);
3508     }
3509   }
3510 
3511   MI.eraseFromParent();
3512   return true;
3513 }
3514 
3515 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3516                                                MachineIRBuilder &B,
3517                                                bool IsInc) const {
3518   B.setInstr(MI);
3519   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3520                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3521   B.buildInstr(Opc)
3522     .addDef(MI.getOperand(0).getReg())
3523     .addUse(MI.getOperand(2).getReg())
3524     .addUse(MI.getOperand(3).getReg())
3525     .cloneMemRefs(MI);
3526   MI.eraseFromParent();
3527   return true;
3528 }
3529 
3530 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3531   switch (IntrID) {
3532   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3533   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3534     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3535   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3536   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3537     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3538   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3539   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3540     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3541   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3542   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3543     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3544   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3545   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3546     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3547   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3548   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3549     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3550   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3551   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3552     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3553   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3554   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3556   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3557   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3558     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3559   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3560   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3561     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3562   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3563   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3564     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3565   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3566   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3567     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3568   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3569   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3570     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3571   default:
3572     llvm_unreachable("unhandled atomic opcode");
3573   }
3574 }
3575 
3576 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3577                                                MachineIRBuilder &B,
3578                                                Intrinsic::ID IID) const {
3579   B.setInstr(MI);
3580 
3581   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3582                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3583 
3584   Register Dst = MI.getOperand(0).getReg();
3585   Register VData = MI.getOperand(2).getReg();
3586 
3587   Register CmpVal;
3588   int OpOffset = 0;
3589 
3590   if (IsCmpSwap) {
3591     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3592     ++OpOffset;
3593   }
3594 
3595   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3596   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3597 
3598   // The struct intrinsic variants add one additional operand over raw.
3599   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3600   Register VIndex;
3601   if (HasVIndex) {
3602     VIndex = MI.getOperand(4 + OpOffset).getReg();
3603     ++OpOffset;
3604   }
3605 
3606   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3607   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3608   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3609 
3610   MachineMemOperand *MMO = *MI.memoperands_begin();
3611 
3612   unsigned ImmOffset;
3613   unsigned TotalOffset;
3614   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3615   if (TotalOffset != 0)
3616     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3617 
3618   if (!VIndex)
3619     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3620 
3621   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3622     .addDef(Dst)
3623     .addUse(VData); // vdata
3624 
3625   if (IsCmpSwap)
3626     MIB.addReg(CmpVal);
3627 
3628   MIB.addUse(RSrc)               // rsrc
3629      .addUse(VIndex)             // vindex
3630      .addUse(VOffset)            // voffset
3631      .addUse(SOffset)            // soffset
3632      .addImm(ImmOffset)          // offset(imm)
3633      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3634      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3635      .addMemOperand(MMO);
3636 
3637   MI.eraseFromParent();
3638   return true;
3639 }
3640 
3641 /// Pack the s16 typed address operands of \p MI into dword-sized registers
3642 /// with <2 x s16> elements, appending the results to \p PackedAddrs.
3643 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3644                                         SmallVectorImpl<Register> &PackedAddrs,
3645                                         int AddrIdx, int DimIdx, int NumVAddrs,
3646                                         int NumGradients) {
3647   const LLT S16 = LLT::scalar(16);
3648   const LLT V2S16 = LLT::vector(2, 16);
3649 
3650   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3651     MachineOperand &SrcOp = MI.getOperand(I);
3652     if (!SrcOp.isReg())
3653       continue; // _L to _LZ may have eliminated this.
3654 
3655     Register AddrReg = SrcOp.getReg();
3656 
3657     if (I < DimIdx) {
3658       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3659       PackedAddrs.push_back(AddrReg);
3660     } else {
3661       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3662       // derivatives dx/dh and dx/dv are packed with undef.
3663       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3664           ((NumGradients / 2) % 2 == 1 &&
3665            (I == DimIdx + (NumGradients / 2) - 1 ||
3666             I == DimIdx + NumGradients - 1)) ||
3667           // Check for _L to _LZ optimization
3668           !MI.getOperand(I + 1).isReg()) {
3669         PackedAddrs.push_back(
3670             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3671                 .getReg(0));
3672       } else {
3673         PackedAddrs.push_back(
3674             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3675                 .getReg(0));
3676         ++I;
3677       }
3678     }
3679   }
3680 }
3681 
3682 /// Convert from separate vaddr components to a single vector address register,
3683 /// and replace the remaining operands with $noreg.
3684 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3685                                      int DimIdx, int NumVAddrs) {
3686   const LLT S32 = LLT::scalar(32);
3687 
3688   SmallVector<Register, 8> AddrRegs;
3689   for (int I = 0; I != NumVAddrs; ++I) {
3690     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3691     if (SrcOp.isReg()) {
3692       AddrRegs.push_back(SrcOp.getReg());
3693       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3694     }
3695   }
3696 
3697   int NumAddrRegs = AddrRegs.size();
3698   if (NumAddrRegs != 1) {
3699     // Round up to 8 elements for v5-v7
3700     // FIXME: Missing intermediate sized register classes and instructions.
3701     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3702       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3703       auto Undef = B.buildUndef(S32);
3704       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3705       NumAddrRegs = RoundedNumRegs;
3706     }
3707 
3708     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3709     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3710   }
3711 
3712   for (int I = 1; I != NumVAddrs; ++I) {
3713     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3714     if (SrcOp.isReg())
3715       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3716   }
3717 }
3718 
3719 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3720 ///
3721 /// Depending on the subtarget, load/store with 16-bit element data need to be
3722 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3723 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3724 /// registers.
3725 ///
3726 /// We don't want to directly select image instructions just yet, but also want
3727 /// to expose all register repacking to the legalizer/combiners. We also don't
3728 /// want a selected instruction entering RegBankSelect. In order to avoid
3729 /// defining a multitude of intermediate image instructions, directly hack on
3730 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3731 /// now unnecessary arguments with $noreg.
3732 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3733     MachineInstr &MI, MachineIRBuilder &B,
3734     GISelChangeObserver &Observer,
3735     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3736   B.setInstr(MI);
3737 
3738   const int NumDefs = MI.getNumExplicitDefs();
3739   bool IsTFE = NumDefs == 2;
3740   // We are only processing the operands of d16 image operations on subtargets
3741   // that use the unpacked register layout, or need to repack the TFE result.
3742 
3743   // TODO: Do we need to guard against already legalized intrinsics?
3744   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3745     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3746 
3747   MachineRegisterInfo *MRI = B.getMRI();
3748   const LLT S32 = LLT::scalar(32);
3749   const LLT S16 = LLT::scalar(16);
3750   const LLT V2S16 = LLT::vector(2, 16);
3751 
3752   // Index of first address argument
3753   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3754 
3755   // Check for 16-bit addresses and pack them if present.
3756   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3757   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3758   const bool IsA16 = AddrTy == S16;
3759 
3760   int NumVAddrs, NumGradients;
3761   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3762   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3763     getDMaskIdx(BaseOpcode, NumDefs);
3764   unsigned DMask = 0;
3765 
3766   int DMaskLanes = 0;
3767   if (!BaseOpcode->Atomic) {
3768     DMask = MI.getOperand(DMaskIdx).getImm();
3769     if (BaseOpcode->Gather4) {
3770       DMaskLanes = 4;
3771     } else if (DMask != 0) {
3772       DMaskLanes = countPopulation(DMask);
3773     } else if (!IsTFE && !BaseOpcode->Store) {
3774       // If dmask is 0, this is a no-op load. This can be eliminated.
3775       B.buildUndef(MI.getOperand(0));
3776       MI.eraseFromParent();
3777       return true;
3778     }
3779   }
3780 
3781   Observer.changingInstr(MI);
3782   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3783 
3784   unsigned NewOpcode = NumDefs == 0 ?
3785     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3786 
3787   // Track that we legalized this
3788   MI.setDesc(B.getTII().get(NewOpcode));
3789 
3790   // Expecting to get an error flag since TFC is on and dmask is 0. Force
3791   // dmask to be at least 1, otherwise the instruction will fail.
3792   if (IsTFE && DMask == 0) {
3793     DMask = 0x1;
3794     DMaskLanes = 1;
3795     MI.getOperand(DMaskIdx).setImm(DMask);
3796   }
3797 
3798   if (BaseOpcode->Atomic) {
3799     Register VData0 = MI.getOperand(2).getReg();
3800     LLT Ty = MRI->getType(VData0);
3801 
3802     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3803     if (Ty.isVector())
3804       return false;
3805 
3806     if (BaseOpcode->AtomicX2) {
3807       Register VData1 = MI.getOperand(3).getReg();
3808       // The two values are packed in one register.
3809       LLT PackedTy = LLT::vector(2, Ty);
3810       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3811       MI.getOperand(2).setReg(Concat.getReg(0));
3812       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3813     }
3814   }
3815 
3816   int CorrectedNumVAddrs = NumVAddrs;
3817 
3818   // Optimize _L to _LZ when _L is zero
3819   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3820         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3821     const ConstantFP *ConstantLod;
3822     const int LodIdx = AddrIdx + NumVAddrs - 1;
3823 
3824     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3825       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3826         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3827         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3828           LZMappingInfo->LZ, ImageDimIntr->Dim);
3829 
3830         // The starting indexes should remain in the same place.
3831         --NumVAddrs;
3832         --CorrectedNumVAddrs;
3833 
3834         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3835           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3836         MI.RemoveOperand(LodIdx);
3837       }
3838     }
3839   }
3840 
3841   // Optimize _mip away, when 'lod' is zero
3842   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3843     int64_t ConstantLod;
3844     const int LodIdx = AddrIdx + NumVAddrs - 1;
3845 
3846     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3847       if (ConstantLod == 0) {
3848         // TODO: Change the intrinsic opcode and remove the operand instead of
3849         // replacing it with 0, as the _L to _LZ handling is done above.
3850         MI.getOperand(LodIdx).ChangeToImmediate(0);
3851         --CorrectedNumVAddrs;
3852       }
3853     }
3854   }
3855 
3856   // If the register allocator cannot place the address registers contiguously
3857   // without introducing moves, then using the non-sequential address encoding
3858   // is always preferable, since it saves VALU instructions and is usually a
3859   // wash in terms of code size or even better.
3860   //
3861   // However, we currently have no way of hinting to the register allocator
3862   // that MIMG addresses should be placed contiguously when it is possible to
3863   // do so, so force non-NSA for the common 2-address case as a heuristic.
3864   //
3865   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3866   // allocation when possible.
3867   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3868 
3869   // Rewrite the addressing register layout before doing anything else.
3870   if (IsA16) {
3871     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3872     // should be introduced.
3873     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3874       return false;
3875 
3876     if (NumVAddrs > 1) {
3877       SmallVector<Register, 4> PackedRegs;
3878       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3879                                   NumGradients);
3880 
3881       if (!UseNSA && PackedRegs.size() > 1) {
3882         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3883         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3884         PackedRegs[0] = Concat.getReg(0);
3885         PackedRegs.resize(1);
3886       }
3887 
3888       const int NumPacked = PackedRegs.size();
3889       for (int I = 0; I != NumVAddrs; ++I) {
3890         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3891         if (!SrcOp.isReg()) {
3892           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3893           continue;
3894         }
3895 
3896         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3897 
3898         if (I < NumPacked)
3899           SrcOp.setReg(PackedRegs[I]);
3900         else
3901           SrcOp.setReg(AMDGPU::NoRegister);
3902       }
3903     }
3904   } else if (!UseNSA && NumVAddrs > 1) {
3905     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3906   }
3907 
3908 
3909   if (BaseOpcode->Store) { // No TFE for stores?
3910     // TODO: Handle dmask trim
3911     Register VData = MI.getOperand(1).getReg();
3912     LLT Ty = MRI->getType(VData);
3913     if (!Ty.isVector() || Ty.getElementType() != S16)
3914       return true;
3915 
3916     B.setInstr(MI);
3917 
3918     Register RepackedReg = handleD16VData(B, *MRI, VData);
3919     if (RepackedReg != VData) {
3920       MI.getOperand(1).setReg(RepackedReg);
3921     }
3922 
3923     return true;
3924   }
3925 
3926   Register DstReg = MI.getOperand(0).getReg();
3927   LLT Ty = MRI->getType(DstReg);
3928   const LLT EltTy = Ty.getScalarType();
3929   const bool IsD16 = Ty.getScalarType() == S16;
3930   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3931 
3932   // Confirm that the return type is large enough for the dmask specified
3933   if (NumElts < DMaskLanes)
3934     return false;
3935 
3936   if (NumElts > 4 || DMaskLanes > 4)
3937     return false;
3938 
3939   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3940   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3941 
3942   // The raw dword aligned data component of the load. The only legal cases
3943   // where this matters should be when using the packed D16 format, for
3944   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3945   LLT RoundedTy;
3946 
3947   // S32 vector to cover all data, plus the TFE result element.
3948   LLT TFETy;
3949 
3950   // Register type to use for each loaded component. Will be S32 or V2S16.
3951   LLT RegTy;
3952 
3953   if (IsD16 && ST.hasUnpackedD16VMem()) {
3954     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3955     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3956     RegTy = S32;
3957   } else {
3958     unsigned EltSize = EltTy.getSizeInBits();
3959     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3960     unsigned RoundedSize = 32 * RoundedElts;
3961     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3962     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3963     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3964   }
3965 
3966   // The return type does not need adjustment.
3967   // TODO: Should we change s16 case to s32 or <2 x s16>?
3968   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3969     return true;
3970 
3971   Register Dst1Reg;
3972 
3973   // Insert after the instruction.
3974   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3975 
3976   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3977   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3978   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3979   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3980 
3981   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3982 
3983   MI.getOperand(0).setReg(NewResultReg);
3984 
3985   // In the IR, TFE is supposed to be used with a 2 element struct return
3986   // type. The instruction really returns these two values in one contiguous
3987   // register, with one additional dword beyond the loaded data. Rewrite the
3988   // return type to use a single register result.
3989 
3990   if (IsTFE) {
3991     Dst1Reg = MI.getOperand(1).getReg();
3992     if (MRI->getType(Dst1Reg) != S32)
3993       return false;
3994 
3995     // TODO: Make sure the TFE operand bit is set.
3996     MI.RemoveOperand(1);
3997 
3998     // Handle the easy case that requires no repack instructions.
3999     if (Ty == S32) {
4000       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4001       return true;
4002     }
4003   }
4004 
4005   // Now figure out how to copy the new result register back into the old
4006   // result.
4007   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4008 
4009   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4010 
4011   if (ResultNumRegs == 1) {
4012     assert(!IsTFE);
4013     ResultRegs[0] = NewResultReg;
4014   } else {
4015     // We have to repack into a new vector of some kind.
4016     for (int I = 0; I != NumDataRegs; ++I)
4017       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4018     B.buildUnmerge(ResultRegs, NewResultReg);
4019 
4020     // Drop the final TFE element to get the data part. The TFE result is
4021     // directly written to the right place already.
4022     if (IsTFE)
4023       ResultRegs.resize(NumDataRegs);
4024   }
4025 
4026   // For an s16 scalar result, we form an s32 result with a truncate regardless
4027   // of packed vs. unpacked.
4028   if (IsD16 && !Ty.isVector()) {
4029     B.buildTrunc(DstReg, ResultRegs[0]);
4030     return true;
4031   }
4032 
4033   // Avoid a build/concat_vector of 1 entry.
4034   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4035     B.buildBitcast(DstReg, ResultRegs[0]);
4036     return true;
4037   }
4038 
4039   assert(Ty.isVector());
4040 
4041   if (IsD16) {
4042     // For packed D16 results with TFE enabled, all the data components are
4043     // S32. Cast back to the expected type.
4044     //
4045     // TODO: We don't really need to load s32 elements. We would only need one
4046     // cast for the TFE result if a multiple of v2s16 was used.
4047     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4048       for (Register &Reg : ResultRegs)
4049         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4050     } else if (ST.hasUnpackedD16VMem()) {
4051       for (Register &Reg : ResultRegs)
4052         Reg = B.buildTrunc(S16, Reg).getReg(0);
4053     }
4054   }
4055 
4056   auto padWithUndef = [&](LLT Ty, int NumElts) {
4057     if (NumElts == 0)
4058       return;
4059     Register Undef = B.buildUndef(Ty).getReg(0);
4060     for (int I = 0; I != NumElts; ++I)
4061       ResultRegs.push_back(Undef);
4062   };
4063 
4064   // Pad out any elements eliminated due to the dmask.
4065   LLT ResTy = MRI->getType(ResultRegs[0]);
4066   if (!ResTy.isVector()) {
4067     padWithUndef(ResTy, NumElts - ResultRegs.size());
4068     B.buildBuildVector(DstReg, ResultRegs);
4069     return true;
4070   }
4071 
4072   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4073   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4074 
4075   // Deal with the one annoying legal case.
4076   const LLT V3S16 = LLT::vector(3, 16);
4077   if (Ty == V3S16) {
4078     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4079     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4080     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4081     return true;
4082   }
4083 
4084   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4085   B.buildConcatVectors(DstReg, ResultRegs);
4086   return true;
4087 }
4088 
4089 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4090   MachineInstr &MI, MachineIRBuilder &B,
4091   GISelChangeObserver &Observer) const {
4092   Register Dst = MI.getOperand(0).getReg();
4093   LLT Ty = B.getMRI()->getType(Dst);
4094   unsigned Size = Ty.getSizeInBits();
4095   MachineFunction &MF = B.getMF();
4096 
4097   Observer.changingInstr(MI);
4098 
4099   // FIXME: We don't really need this intermediate instruction. The intrinsic
4100   // should be fixed to have a memory operand. Since it's readnone, we're not
4101   // allowed to add one.
4102   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4103   MI.RemoveOperand(1); // Remove intrinsic ID
4104 
4105   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4106   // TODO: Should this use datalayout alignment?
4107   const unsigned MemSize = (Size + 7) / 8;
4108   const Align MemAlign(4);
4109   MachineMemOperand *MMO = MF.getMachineMemOperand(
4110       MachinePointerInfo(),
4111       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4112           MachineMemOperand::MOInvariant,
4113       MemSize, MemAlign);
4114   MI.addMemOperand(MF, MMO);
4115 
4116   // There are no 96-bit result scalar loads, but widening to 128-bit should
4117   // always be legal. We may need to restore this to a 96-bit result if it turns
4118   // out this needs to be converted to a vector load during RegBankSelect.
4119   if (!isPowerOf2_32(Size)) {
4120     LegalizerHelper Helper(MF, *this, Observer, B);
4121     B.setInstr(MI);
4122 
4123     if (Ty.isVector())
4124       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4125     else
4126       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4127   }
4128 
4129   Observer.changedInstr(MI);
4130   return true;
4131 }
4132 
4133 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4134                                                 MachineRegisterInfo &MRI,
4135                                                 MachineIRBuilder &B) const {
4136   B.setInstr(MI);
4137 
4138   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4139   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4140       !ST.isTrapHandlerEnabled()) {
4141     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4142   } else {
4143     // Pass queue pointer to trap handler as input, and insert trap instruction
4144     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4145     const ArgDescriptor *Arg =
4146         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4147     if (!Arg)
4148       return false;
4149     MachineRegisterInfo &MRI = *B.getMRI();
4150     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4151     Register LiveIn = getLiveInRegister(
4152         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4153         /*InsertLiveInCopy=*/false);
4154     if (!loadInputValue(LiveIn, B, Arg))
4155       return false;
4156     B.buildCopy(SGPR01, LiveIn);
4157     B.buildInstr(AMDGPU::S_TRAP)
4158         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4159         .addReg(SGPR01, RegState::Implicit);
4160   }
4161 
4162   MI.eraseFromParent();
4163   return true;
4164 }
4165 
4166 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4167     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4168   B.setInstr(MI);
4169 
4170   // If this is a non-HSA path or the trap handler is disabled, report a
4171   // warning accordingly.
4172   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4173       !ST.isTrapHandlerEnabled()) {
4174     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4175                                      "debugtrap handler not supported",
4176                                      MI.getDebugLoc(), DS_Warning);
4177     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4178     Ctx.diagnose(NoTrap);
4179   } else {
4180     // Insert debug-trap instruction
4181     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4182   }
4183 
4184   MI.eraseFromParent();
4185   return true;
4186 }
4187 
4188 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4189                                             MachineIRBuilder &B,
4190                                             GISelChangeObserver &Observer) const {
4191   MachineRegisterInfo &MRI = *B.getMRI();
4192 
4193   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4194   auto IntrID = MI.getIntrinsicID();
4195   switch (IntrID) {
4196   case Intrinsic::amdgcn_if:
4197   case Intrinsic::amdgcn_else: {
4198     MachineInstr *Br = nullptr;
4199     MachineBasicBlock *UncondBrTarget = nullptr;
4200     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4201       const SIRegisterInfo *TRI
4202         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4203 
4204       B.setInstr(*BrCond);
4205       Register Def = MI.getOperand(1).getReg();
4206       Register Use = MI.getOperand(3).getReg();
4207 
4208       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4209       if (IntrID == Intrinsic::amdgcn_if) {
4210         B.buildInstr(AMDGPU::SI_IF)
4211           .addDef(Def)
4212           .addUse(Use)
4213           .addMBB(UncondBrTarget);
4214       } else {
4215         B.buildInstr(AMDGPU::SI_ELSE)
4216           .addDef(Def)
4217           .addUse(Use)
4218           .addMBB(UncondBrTarget)
4219           .addImm(0);
4220       }
4221 
4222       if (Br) {
4223         Br->getOperand(0).setMBB(CondBrTarget);
4224       } else {
4225         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4226         // since we're swapping branch targets it needs to be reinserted.
4227         // FIXME: IRTranslator should probably not do this
4228         B.buildBr(*CondBrTarget);
4229       }
4230 
4231       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4232       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4233       MI.eraseFromParent();
4234       BrCond->eraseFromParent();
4235       return true;
4236     }
4237 
4238     return false;
4239   }
4240   case Intrinsic::amdgcn_loop: {
4241     MachineInstr *Br = nullptr;
4242     MachineBasicBlock *UncondBrTarget = nullptr;
4243     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4244       const SIRegisterInfo *TRI
4245         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4246 
4247       B.setInstr(*BrCond);
4248 
4249       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4250       Register Reg = MI.getOperand(2).getReg();
4251       B.buildInstr(AMDGPU::SI_LOOP)
4252         .addUse(Reg)
4253         .addMBB(UncondBrTarget);
4254 
4255       if (Br)
4256         Br->getOperand(0).setMBB(CondBrTarget);
4257       else
4258         B.buildBr(*CondBrTarget);
4259 
4260       MI.eraseFromParent();
4261       BrCond->eraseFromParent();
4262       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4263       return true;
4264     }
4265 
4266     return false;
4267   }
4268   case Intrinsic::amdgcn_kernarg_segment_ptr:
4269     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4270       B.setInstr(MI);
4271       // This only makes sense to call in a kernel, so just lower to null.
4272       B.buildConstant(MI.getOperand(0).getReg(), 0);
4273       MI.eraseFromParent();
4274       return true;
4275     }
4276 
4277     return legalizePreloadedArgIntrin(
4278       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4279   case Intrinsic::amdgcn_implicitarg_ptr:
4280     return legalizeImplicitArgPtr(MI, MRI, B);
4281   case Intrinsic::amdgcn_workitem_id_x:
4282     return legalizePreloadedArgIntrin(MI, MRI, B,
4283                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4284   case Intrinsic::amdgcn_workitem_id_y:
4285     return legalizePreloadedArgIntrin(MI, MRI, B,
4286                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4287   case Intrinsic::amdgcn_workitem_id_z:
4288     return legalizePreloadedArgIntrin(MI, MRI, B,
4289                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4290   case Intrinsic::amdgcn_workgroup_id_x:
4291     return legalizePreloadedArgIntrin(MI, MRI, B,
4292                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4293   case Intrinsic::amdgcn_workgroup_id_y:
4294     return legalizePreloadedArgIntrin(MI, MRI, B,
4295                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4296   case Intrinsic::amdgcn_workgroup_id_z:
4297     return legalizePreloadedArgIntrin(MI, MRI, B,
4298                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4299   case Intrinsic::amdgcn_dispatch_ptr:
4300     return legalizePreloadedArgIntrin(MI, MRI, B,
4301                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4302   case Intrinsic::amdgcn_queue_ptr:
4303     return legalizePreloadedArgIntrin(MI, MRI, B,
4304                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4305   case Intrinsic::amdgcn_implicit_buffer_ptr:
4306     return legalizePreloadedArgIntrin(
4307       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4308   case Intrinsic::amdgcn_dispatch_id:
4309     return legalizePreloadedArgIntrin(MI, MRI, B,
4310                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4311   case Intrinsic::amdgcn_fdiv_fast:
4312     return legalizeFDIVFastIntrin(MI, MRI, B);
4313   case Intrinsic::amdgcn_is_shared:
4314     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4315   case Intrinsic::amdgcn_is_private:
4316     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4317   case Intrinsic::amdgcn_wavefrontsize: {
4318     B.setInstr(MI);
4319     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4320     MI.eraseFromParent();
4321     return true;
4322   }
4323   case Intrinsic::amdgcn_s_buffer_load:
4324     return legalizeSBufferLoad(MI, B, Observer);
4325   case Intrinsic::amdgcn_raw_buffer_store:
4326   case Intrinsic::amdgcn_struct_buffer_store:
4327     return legalizeBufferStore(MI, MRI, B, false, false);
4328   case Intrinsic::amdgcn_raw_buffer_store_format:
4329   case Intrinsic::amdgcn_struct_buffer_store_format:
4330     return legalizeBufferStore(MI, MRI, B, false, true);
4331   case Intrinsic::amdgcn_raw_tbuffer_store:
4332   case Intrinsic::amdgcn_struct_tbuffer_store:
4333     return legalizeBufferStore(MI, MRI, B, true, true);
4334   case Intrinsic::amdgcn_raw_buffer_load:
4335   case Intrinsic::amdgcn_struct_buffer_load:
4336     return legalizeBufferLoad(MI, MRI, B, false, false);
4337   case Intrinsic::amdgcn_raw_buffer_load_format:
4338   case Intrinsic::amdgcn_struct_buffer_load_format:
4339     return legalizeBufferLoad(MI, MRI, B, true, false);
4340   case Intrinsic::amdgcn_raw_tbuffer_load:
4341   case Intrinsic::amdgcn_struct_tbuffer_load:
4342     return legalizeBufferLoad(MI, MRI, B, true, true);
4343   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4344   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4345   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4346   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4347   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4348   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4349   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4350   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4351   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4352   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4353   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4354   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4355   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4356   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4357   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4358   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4359   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4360   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4361   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4362   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4363   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4364   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4365   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4366   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4367   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4368   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4369     return legalizeBufferAtomic(MI, B, IntrID);
4370   case Intrinsic::amdgcn_atomic_inc:
4371     return legalizeAtomicIncDec(MI, B, true);
4372   case Intrinsic::amdgcn_atomic_dec:
4373     return legalizeAtomicIncDec(MI, B, false);
4374   case Intrinsic::trap:
4375     return legalizeTrapIntrinsic(MI, MRI, B);
4376   case Intrinsic::debugtrap:
4377     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4378   default: {
4379     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4380             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4381       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4382     return true;
4383   }
4384   }
4385 
4386   return true;
4387 }
4388