1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
// Round the number of elements up to the next power of two.
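// For example, <3 x s32> becomes <4 x s32> and <5 x s16> becomes <8 x s16>.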
40 static LLT getPow2VectorType(LLT Ty) {
41   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
43   return Ty.changeNumElements(Pow2NElts);
44 }
45 
// Round the scalar size in bits up to the next power of two.
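// For example, s48 rounds up to s64 and s17 rounds up to s32.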
47 static LLT getPow2ScalarType(LLT Ty) {
48   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
50   return LLT::scalar(Pow2Bits);
51 }
52 
53 static LegalityPredicate isMultiple32(unsigned TypeIdx,
54                                       unsigned MaxSize = 1024) {
55   return [=](const LegalityQuery &Query) {
56     const LLT Ty = Query.Types[TypeIdx];
57     const LLT EltTy = Ty.getScalarType();
58     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
59   };
60 }
61 
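// True for vectors of sub-32-bit elements with an odd element count whose
// total size is not a multiple of 32 bits, e.g. <3 x s16> (48 bits) or
// <3 x s8> (24 bits).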
62 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
63   return [=](const LegalityQuery &Query) {
64     const LLT Ty = Query.Types[TypeIdx];
65     return Ty.isVector() &&
66            Ty.getNumElements() % 2 != 0 &&
67            Ty.getElementType().getSizeInBits() < 32 &&
68            Ty.getSizeInBits() % 32 != 0;
69   };
70 }
71 
72 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     const LLT EltTy = Ty.getScalarType();
76     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
77   };
78 }
79 
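// Pad the vector with one extra element, e.g. <3 x s16> -> <4 x s16>.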
80 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     const LLT EltTy = Ty.getElementType();
84     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
85   };
86 }
87 
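// Reduce the element count so each piece is at most ~64 bits, e.g. <5 x s32>
// (160 bits) needs 3 pieces, so the type is reduced to <2 x s32>.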
88 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getElementType();
92     unsigned Size = Ty.getSizeInBits();
93     unsigned Pieces = (Size + 63) / 64;
94     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
95     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
96   };
97 }
98 
// Increase the number of vector elements so the total size is a multiple of
// 32 bits.
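// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).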
101 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104 
105     const LLT EltTy = Ty.getElementType();
106     const int Size = Ty.getSizeInBits();
107     const int EltSize = EltTy.getSizeInBits();
    // NextMul32 * 32 is the total size rounded up to a multiple of 32 bits.
    const int NextMul32 = (Size + 31) / 32;
109 
110     assert(EltSize < 32);
111 
112     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
113     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
114   };
115 }
116 
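// Bitcast to a type made of 32-bit register pieces, e.g. <4 x s8> -> s32 and
// <8 x s16> -> <4 x s32>. The only sub-32-bit case expected here is 16 bits,
// e.g. <2 x s8> -> s16.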
117 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
118   return [=](const LegalityQuery &Query) {
119     const LLT Ty = Query.Types[TypeIdx];
120     unsigned Size = Ty.getSizeInBits();
121 
122     LLT CoercedTy;
123     if (Size < 32) {
124       // <2 x s8> -> s16
125       assert(Size == 16);
126       CoercedTy = LLT::scalar(16);
127     } else
128       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
129 
130     return std::make_pair(TypeIdx, CoercedTy);
131   };
132 }
133 
134 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
135   return [=](const LegalityQuery &Query) {
136     const LLT QueryTy = Query.Types[TypeIdx];
137     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
138   };
139 }
140 
141 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
142   return [=](const LegalityQuery &Query) {
143     const LLT QueryTy = Query.Types[TypeIdx];
144     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
145   };
146 }
147 
148 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
149   return [=](const LegalityQuery &Query) {
150     const LLT QueryTy = Query.Types[TypeIdx];
151     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
152   };
153 }
154 
155 static bool isRegisterSize(unsigned Size) {
156   return Size % 32 == 0 && Size <= 1024;
157 }
158 
159 static bool isRegisterVectorElementType(LLT EltTy) {
160   const int EltSize = EltTy.getSizeInBits();
161   return EltSize == 16 || EltSize % 32 == 0;
162 }
163 
164 static bool isRegisterVectorType(LLT Ty) {
165   const int EltSize = Ty.getElementType().getSizeInBits();
166   return EltSize == 32 || EltSize == 64 ||
167          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
168          EltSize == 128 || EltSize == 256;
169 }
170 
171 static bool isRegisterType(LLT Ty) {
172   if (!isRegisterSize(Ty.getSizeInBits()))
173     return false;
174 
175   if (Ty.isVector())
176     return isRegisterVectorType(Ty);
177 
178   return true;
179 }
180 
181 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
182 // v2s16.
183 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     return isRegisterType(Query.Types[TypeIdx]);
186   };
187 }
188 
189 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
190   return [=](const LegalityQuery &Query) {
191     const LLT QueryTy = Query.Types[TypeIdx];
192     if (!QueryTy.isVector())
193       return false;
194     const LLT EltTy = QueryTy.getElementType();
195     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
196   };
197 }
198 
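// True for a truncating store of a scalar wider than 32 bits (the register is
// larger than the memory access size).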
199 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
200   return [=](const LegalityQuery &Query) {
201     const LLT Ty = Query.Types[TypeIdx];
202     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
203            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
204   };
205 }
206 
207 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
208 // handle some operations by just promoting the register during
209 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
210 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
211                                     bool IsLoad) {
212   switch (AS) {
213   case AMDGPUAS::PRIVATE_ADDRESS:
214     // FIXME: Private element size.
215     return 32;
216   case AMDGPUAS::LOCAL_ADDRESS:
217     return ST.useDS128() ? 128 : 64;
218   case AMDGPUAS::GLOBAL_ADDRESS:
219   case AMDGPUAS::CONSTANT_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
227     return IsLoad ? 512 : 128;
228   default:
229     // Flat addresses may contextually need to be split to 32-bit parts if they
230     // may alias scratch depending on the subtarget.
231     return 128;
232   }
233 }
234 
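// Return true if a load or store with this register type, memory size,
// alignment and address space is directly legal (256- and 512-bit accesses
// may still be broken down contextually later).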
235 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
236                                  const LegalityQuery &Query,
237                                  unsigned Opcode) {
238   const LLT Ty = Query.Types[0];
239 
240   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
241   const bool IsLoad = Opcode != AMDGPU::G_STORE;
242 
243   unsigned RegSize = Ty.getSizeInBits();
244   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
245   unsigned Align = Query.MMODescrs[0].AlignInBits;
246   unsigned AS = Query.Types[1].getAddressSpace();
247 
248   // All of these need to be custom lowered to cast the pointer operand.
249   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
250     return false;
251 
252   // TODO: We should be able to widen loads if the alignment is high enough, but
253   // we also need to modify the memory access size.
254 #if 0
255   // Accept widening loads based on alignment.
256   if (IsLoad && MemSize < Size)
257     MemSize = std::max(MemSize, Align);
258 #endif
259 
260   // Only 1-byte and 2-byte to 32-bit extloads are valid.
261   if (MemSize != RegSize && RegSize != 32)
262     return false;
263 
264   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
265     return false;
266 
267   switch (MemSize) {
268   case 8:
269   case 16:
270   case 32:
271   case 64:
272   case 128:
273     break;
274   case 96:
275     if (!ST.hasDwordx3LoadStores())
276       return false;
277     break;
278   case 256:
279   case 512:
280     // These may contextually need to be broken down.
281     break;
282   default:
283     return false;
284   }
285 
286   assert(RegSize >= MemSize);
287 
288   if (Align < MemSize) {
289     const SITargetLowering *TLI = ST.getTargetLowering();
290     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
291       return false;
292   }
293 
294   return true;
295 }
296 
297 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
298                              unsigned Opcode) {
299   const LLT Ty = Query.Types[0];
300   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode);
301 }
302 
303 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
304                                          const GCNTargetMachine &TM)
  : ST(ST_) {
306   using namespace TargetOpcode;
307 
308   auto GetAddrSpacePtr = [&TM](unsigned AS) {
309     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
310   };
311 
312   const LLT S1 = LLT::scalar(1);
313   const LLT S16 = LLT::scalar(16);
314   const LLT S32 = LLT::scalar(32);
315   const LLT S64 = LLT::scalar(64);
316   const LLT S128 = LLT::scalar(128);
317   const LLT S256 = LLT::scalar(256);
318   const LLT S512 = LLT::scalar(512);
319   const LLT S1024 = LLT::scalar(1024);
320 
321   const LLT V2S16 = LLT::vector(2, 16);
322   const LLT V4S16 = LLT::vector(4, 16);
323 
324   const LLT V2S32 = LLT::vector(2, 32);
325   const LLT V3S32 = LLT::vector(3, 32);
326   const LLT V4S32 = LLT::vector(4, 32);
327   const LLT V5S32 = LLT::vector(5, 32);
328   const LLT V6S32 = LLT::vector(6, 32);
329   const LLT V7S32 = LLT::vector(7, 32);
330   const LLT V8S32 = LLT::vector(8, 32);
331   const LLT V9S32 = LLT::vector(9, 32);
332   const LLT V10S32 = LLT::vector(10, 32);
333   const LLT V11S32 = LLT::vector(11, 32);
334   const LLT V12S32 = LLT::vector(12, 32);
335   const LLT V13S32 = LLT::vector(13, 32);
336   const LLT V14S32 = LLT::vector(14, 32);
337   const LLT V15S32 = LLT::vector(15, 32);
338   const LLT V16S32 = LLT::vector(16, 32);
339   const LLT V32S32 = LLT::vector(32, 32);
340 
341   const LLT V2S64 = LLT::vector(2, 64);
342   const LLT V3S64 = LLT::vector(3, 64);
343   const LLT V4S64 = LLT::vector(4, 64);
344   const LLT V5S64 = LLT::vector(5, 64);
345   const LLT V6S64 = LLT::vector(6, 64);
346   const LLT V7S64 = LLT::vector(7, 64);
347   const LLT V8S64 = LLT::vector(8, 64);
348   const LLT V16S64 = LLT::vector(16, 64);
349 
350   std::initializer_list<LLT> AllS32Vectors =
351     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
352      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
353   std::initializer_list<LLT> AllS64Vectors =
354     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
355 
356   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
357   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
358   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
359   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
360   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
361   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
362   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
363 
364   const LLT CodePtr = FlatPtr;
365 
366   const std::initializer_list<LLT> AddrSpaces64 = {
367     GlobalPtr, ConstantPtr, FlatPtr
368   };
369 
370   const std::initializer_list<LLT> AddrSpaces32 = {
371     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
372   };
373 
374   const std::initializer_list<LLT> FPTypesBase = {
375     S32, S64
376   };
377 
378   const std::initializer_list<LLT> FPTypes16 = {
379     S32, S64, S16
380   };
381 
382   const std::initializer_list<LLT> FPTypesPK16 = {
383     S32, S64, S16, V2S16
384   };
385 
386   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
387 
388   setAction({G_BRCOND, S1}, Legal); // VCC branches
389   setAction({G_BRCOND, S32}, Legal); // SCC branches
390 
391   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
392   // elements for v3s16
393   getActionDefinitionsBuilder(G_PHI)
394     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
395     .legalFor(AllS32Vectors)
396     .legalFor(AllS64Vectors)
397     .legalFor(AddrSpaces64)
398     .legalFor(AddrSpaces32)
399     .clampScalar(0, S32, S256)
400     .widenScalarToNextPow2(0, 32)
401     .clampMaxNumElements(0, S32, 16)
402     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
403     .legalIf(isPointer(0));
404 
405   if (ST.hasVOP3PInsts()) {
406     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
407       .legalFor({S32, S16, V2S16})
408       .clampScalar(0, S16, S32)
409       .clampMaxNumElements(0, S16, 2)
410       .scalarize(0)
411       .widenScalarToNextPow2(0, 32);
412   } else if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
414       .legalFor({S32, S16})
415       .clampScalar(0, S16, S32)
416       .scalarize(0)
417       .widenScalarToNextPow2(0, 32);
418   } else {
419     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
420       .legalFor({S32})
421       .clampScalar(0, S32, S32)
422       .scalarize(0);
423   }
424 
425   // FIXME: Not really legal. Placeholder for custom lowering.
426   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
427     .customFor({S32, S64})
428     .clampScalar(0, S32, S64)
429     .widenScalarToNextPow2(0, 32)
430     .scalarize(0);
431 
432   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
433     .legalFor({S32})
434     .clampScalar(0, S32, S32)
435     .scalarize(0);
436 
437   // Report legal for any types we can handle anywhere. For the cases only legal
438   // on the SALU, RegBankSelect will be able to re-legalize.
439   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
440     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
441     .clampScalar(0, S32, S64)
442     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
443     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
444     .widenScalarToNextPow2(0)
445     .scalarize(0);
446 
447   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
448                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
449     .legalFor({{S32, S1}, {S32, S32}})
450     .minScalar(0, S32)
451     // TODO: .scalarize(0)
452     .lower();
453 
454   getActionDefinitionsBuilder(G_BITCAST)
455     // Don't worry about the size constraint.
456     .legalIf(all(isRegisterType(0), isRegisterType(1)))
457     .lower();
458 
459 
460   getActionDefinitionsBuilder(G_CONSTANT)
461     .legalFor({S1, S32, S64, S16, GlobalPtr,
462                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
463     .clampScalar(0, S32, S64)
464     .widenScalarToNextPow2(0)
465     .legalIf(isPointer(0));
466 
467   getActionDefinitionsBuilder(G_FCONSTANT)
468     .legalFor({S32, S64, S16})
469     .clampScalar(0, S16, S64);
470 
471   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
472       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
473                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
474       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
475       .clampScalarOrElt(0, S32, S1024)
476       .legalIf(isMultiple32(0))
477       .widenScalarToNextPow2(0, 32)
478       .clampMaxNumElements(0, S32, 16);
479 
480   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
481 
482   // If the amount is divergent, we have to do a wave reduction to get the
483   // maximum value, so this is expanded during RegBankSelect.
484   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
485     .legalFor({{PrivatePtr, S32}});
486 
487   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
488     .unsupportedFor({PrivatePtr})
489     .custom();
490   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491 
492   auto &FPOpActions = getActionDefinitionsBuilder(
493     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
494     .legalFor({S32, S64});
495   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
496     .customFor({S32, S64});
497   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
498     .customFor({S32, S64});
499 
500   if (ST.has16BitInsts()) {
501     if (ST.hasVOP3PInsts())
502       FPOpActions.legalFor({S16, V2S16});
503     else
504       FPOpActions.legalFor({S16});
505 
506     TrigActions.customFor({S16});
507     FDIVActions.customFor({S16});
508   }
509 
510   auto &MinNumMaxNum = getActionDefinitionsBuilder({
511       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
512 
513   if (ST.hasVOP3PInsts()) {
514     MinNumMaxNum.customFor(FPTypesPK16)
515       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
516       .clampMaxNumElements(0, S16, 2)
517       .clampScalar(0, S16, S64)
518       .scalarize(0);
519   } else if (ST.has16BitInsts()) {
520     MinNumMaxNum.customFor(FPTypes16)
521       .clampScalar(0, S16, S64)
522       .scalarize(0);
523   } else {
524     MinNumMaxNum.customFor(FPTypesBase)
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   }
528 
529   if (ST.hasVOP3PInsts())
530     FPOpActions.clampMaxNumElements(0, S16, 2);
531 
532   FPOpActions
533     .scalarize(0)
534     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
535 
536   TrigActions
537     .scalarize(0)
538     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
539 
540   FDIVActions
541     .scalarize(0)
542     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
543 
544   getActionDefinitionsBuilder({G_FNEG, G_FABS})
545     .legalFor(FPTypesPK16)
546     .clampMaxNumElements(0, S16, 2)
547     .scalarize(0)
548     .clampScalar(0, S16, S64);
549 
550   if (ST.has16BitInsts()) {
551     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
552       .legalFor({S32, S64, S16})
553       .scalarize(0)
554       .clampScalar(0, S16, S64);
555   } else {
556     getActionDefinitionsBuilder(G_FSQRT)
557       .legalFor({S32, S64})
558       .scalarize(0)
559       .clampScalar(0, S32, S64);
560 
561     if (ST.hasFractBug()) {
562       getActionDefinitionsBuilder(G_FFLOOR)
563         .customFor({S64})
564         .legalFor({S32, S64})
565         .scalarize(0)
566         .clampScalar(0, S32, S64);
567     } else {
568       getActionDefinitionsBuilder(G_FFLOOR)
569         .legalFor({S32, S64})
570         .scalarize(0)
571         .clampScalar(0, S32, S64);
572     }
573   }
574 
575   getActionDefinitionsBuilder(G_FPTRUNC)
576     .legalFor({{S32, S64}, {S16, S32}})
577     .scalarize(0)
578     .lower();
579 
580   getActionDefinitionsBuilder(G_FPEXT)
581     .legalFor({{S64, S32}, {S32, S16}})
582     .lowerFor({{S64, S16}}) // FIXME: Implement
583     .scalarize(0);
584 
585   getActionDefinitionsBuilder(G_FSUB)
586       // Use actual fsub instruction
587       .legalFor({S32})
588       // Must use fadd + fneg
589       .lowerFor({S64, S16, V2S16})
590       .scalarize(0)
591       .clampScalar(0, S32, S64);
592 
593   // Whether this is legal depends on the floating point mode for the function.
594   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
595   if (ST.hasMadF16())
596     FMad.customFor({S32, S16});
597   else
598     FMad.customFor({S32});
599   FMad.scalarize(0)
600       .lower();
601 
602   // TODO: Do we need to clamp maximum bitwidth?
603   getActionDefinitionsBuilder(G_TRUNC)
604     .legalIf(isScalar(0))
605     .legalFor({{V2S16, V2S32}})
606     .clampMaxNumElements(0, S16, 2)
607     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
608     // situations (like an invalid implicit use), we don't want to infinite loop
609     // in the legalizer.
610     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
611     .alwaysLegal();
612 
613   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
614     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
615                {S32, S1}, {S64, S1}, {S16, S1}})
616     .scalarize(0)
617     .clampScalar(0, S32, S64)
618     .widenScalarToNextPow2(1, 32);
619 
620   // TODO: Split s1->s64 during regbankselect for VALU.
621   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
622     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
623     .lowerFor({{S32, S64}})
624     .lowerIf(typeIs(1, S1))
625     .customFor({{S64, S64}});
626   if (ST.has16BitInsts())
627     IToFP.legalFor({{S16, S16}});
628   IToFP.clampScalar(1, S32, S64)
629        .scalarize(0)
630        .widenScalarToNextPow2(1);
631 
632   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
633     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
634     .customFor({{S64, S64}});
635   if (ST.has16BitInsts())
636     FPToI.legalFor({{S16, S16}});
637   else
638     FPToI.minScalar(1, S32);
639 
640   FPToI.minScalar(0, S32)
641        .scalarize(0)
642        .lower();
643 
644   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
645     .scalarize(0)
646     .lower();
647 
648   if (ST.has16BitInsts()) {
649     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
650       .legalFor({S16, S32, S64})
651       .clampScalar(0, S16, S64)
652       .scalarize(0);
653   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
654     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
655       .legalFor({S32, S64})
656       .clampScalar(0, S32, S64)
657       .scalarize(0);
658   } else {
659     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
660       .legalFor({S32})
661       .customFor({S64})
662       .clampScalar(0, S32, S64)
663       .scalarize(0);
664   }
665 
666   // FIXME: Clamp offset operand.
667   getActionDefinitionsBuilder(G_PTR_ADD)
668     .legalIf(isPointer(0))
669     .scalarize(0);
670 
671   getActionDefinitionsBuilder(G_PTRMASK)
672     .legalIf(typeInSet(1, {S64, S32}))
673     .minScalar(1, S32)
674     .maxScalarIf(sizeIs(0, 32), 1, S32)
675     .maxScalarIf(sizeIs(0, 64), 1, S64)
676     .scalarize(0);
677 
678   auto &CmpBuilder =
679     getActionDefinitionsBuilder(G_ICMP)
680     // The compare output type differs based on the register bank of the output,
681     // so make both s1 and s32 legal.
682     //
683     // Scalar compares producing output in scc will be promoted to s32, as that
684     // is the allocatable register type that will be needed for the copy from
685     // scc. This will be promoted during RegBankSelect, and we assume something
686     // before that won't try to use s32 result types.
687     //
688     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
689     // bank.
690     .legalForCartesianProduct(
691       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
692     .legalForCartesianProduct(
693       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
694   if (ST.has16BitInsts()) {
695     CmpBuilder.legalFor({{S1, S16}});
696   }
697 
698   CmpBuilder
699     .widenScalarToNextPow2(1)
700     .clampScalar(1, S32, S64)
701     .scalarize(0)
702     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
703 
704   getActionDefinitionsBuilder(G_FCMP)
705     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
706     .widenScalarToNextPow2(1)
707     .clampScalar(1, S32, S64)
708     .scalarize(0);
709 
710   // FIXME: fpow has a selection pattern that should move to custom lowering.
711   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
712   if (ST.has16BitInsts())
713     Exp2Ops.legalFor({S32, S16});
714   else
715     Exp2Ops.legalFor({S32});
716   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
717   Exp2Ops.scalarize(0);
718 
719   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
720   if (ST.has16BitInsts())
721     ExpOps.customFor({{S32}, {S16}});
722   else
723     ExpOps.customFor({S32});
724   ExpOps.clampScalar(0, MinScalarFPTy, S32)
725         .scalarize(0);
726 
727   // The 64-bit versions produce 32-bit results, but only on the SALU.
728   getActionDefinitionsBuilder(G_CTPOP)
729     .legalFor({{S32, S32}, {S32, S64}})
730     .clampScalar(0, S32, S32)
731     .clampScalar(1, S32, S64)
732     .scalarize(0)
733     .widenScalarToNextPow2(0, 32)
734     .widenScalarToNextPow2(1, 32);
735 
736   // The hardware instructions return a different result on 0 than the generic
737   // instructions expect. The hardware produces -1, but these produce the
738   // bitwidth.
739   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
740     .scalarize(0)
741     .clampScalar(0, S32, S32)
742     .clampScalar(1, S32, S64)
743     .widenScalarToNextPow2(0, 32)
744     .widenScalarToNextPow2(1, 32)
745     .lower();
746 
747   // The 64-bit versions produce 32-bit results, but only on the SALU.
748   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
749     .legalFor({{S32, S32}, {S32, S64}})
750     .clampScalar(0, S32, S32)
751     .clampScalar(1, S32, S64)
752     .scalarize(0)
753     .widenScalarToNextPow2(0, 32)
754     .widenScalarToNextPow2(1, 32);
755 
756   getActionDefinitionsBuilder(G_BITREVERSE)
757     .legalFor({S32})
758     .clampScalar(0, S32, S32)
759     .scalarize(0);
760 
761   if (ST.has16BitInsts()) {
762     getActionDefinitionsBuilder(G_BSWAP)
763       .legalFor({S16, S32, V2S16})
764       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
767       .widenScalarToNextPow2(0)
768       .clampScalar(0, S16, S32)
769       .scalarize(0);
770 
771     if (ST.hasVOP3PInsts()) {
772       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
773         .legalFor({S32, S16, V2S16})
774         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
775         .clampMaxNumElements(0, S16, 2)
776         .minScalar(0, S16)
777         .widenScalarToNextPow2(0)
778         .scalarize(0)
779         .lower();
780     } else {
781       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
782         .legalFor({S32, S16})
783         .widenScalarToNextPow2(0)
784         .minScalar(0, S16)
785         .scalarize(0)
786         .lower();
787     }
788   } else {
789     // TODO: Should have same legality without v_perm_b32
790     getActionDefinitionsBuilder(G_BSWAP)
791       .legalFor({S32})
792       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
795       .widenScalarToNextPow2(0)
796       .maxScalar(0, S32)
797       .scalarize(0)
798       .lower();
799 
800     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
801       .legalFor({S32})
802       .minScalar(0, S32)
803       .widenScalarToNextPow2(0)
804       .scalarize(0)
805       .lower();
806   }
807 
808   getActionDefinitionsBuilder(G_INTTOPTR)
809     // List the common cases
810     .legalForCartesianProduct(AddrSpaces64, {S64})
811     .legalForCartesianProduct(AddrSpaces32, {S32})
812     .scalarize(0)
813     // Accept any address space as long as the size matches
814     .legalIf(sameSize(0, 1))
815     .widenScalarIf(smallerThan(1, 0),
816       [](const LegalityQuery &Query) {
817         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
818       })
819     .narrowScalarIf(largerThan(1, 0),
820       [](const LegalityQuery &Query) {
821         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
822       });
823 
824   getActionDefinitionsBuilder(G_PTRTOINT)
825     // List the common cases
826     .legalForCartesianProduct(AddrSpaces64, {S64})
827     .legalForCartesianProduct(AddrSpaces32, {S32})
828     .scalarize(0)
829     // Accept any address space as long as the size matches
830     .legalIf(sameSize(0, 1))
831     .widenScalarIf(smallerThan(0, 1),
832       [](const LegalityQuery &Query) {
833         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
834       })
835     .narrowScalarIf(
836       largerThan(0, 1),
837       [](const LegalityQuery &Query) {
838         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
839       });
840 
841   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
842     .scalarize(0)
843     .custom();
844 
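  // Return true if the memory access must be split: a vector extload, a size
  // too wide for the address space, an odd size that does not break evenly
  // into dword pieces, or insufficient alignment.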
845   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
846                                     bool IsLoad) -> bool {
847     const LLT DstTy = Query.Types[0];
848 
849     // Split vector extloads.
850     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
851     unsigned Align = Query.MMODescrs[0].AlignInBits;
852 
853     if (MemSize < DstTy.getSizeInBits())
854       MemSize = std::max(MemSize, Align);
855 
856     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
857       return true;
858 
859     const LLT PtrTy = Query.Types[1];
860     unsigned AS = PtrTy.getAddressSpace();
861     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
862       return true;
863 
864     // Catch weird sized loads that don't evenly divide into the access sizes
865     // TODO: May be able to widen depending on alignment etc.
866     unsigned NumRegs = (MemSize + 31) / 32;
867     if (NumRegs == 3) {
868       if (!ST.hasDwordx3LoadStores())
869         return true;
870     } else {
871       // If the alignment allows, these should have been widened.
872       if (!isPowerOf2_32(NumRegs))
873         return true;
874     }
875 
876     if (Align < MemSize) {
877       const SITargetLowering *TLI = ST.getTargetLowering();
878       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
879     }
880 
881     return false;
882   };
883 
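  // Return true if a non-power-of-2 sized load result should be widened to the
  // next power of 2: the original size must be below the address space limit
  // and the known alignment must cover the rounded-up size.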
884   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
885                                          unsigned Opc) -> bool {
886     unsigned Size = Query.Types[0].getSizeInBits();
887     if (isPowerOf2_32(Size))
888       return false;
889 
890     if (Size == 96 && ST.hasDwordx3LoadStores())
891       return false;
892 
893     unsigned AddrSpace = Query.Types[1].getAddressSpace();
894     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
895       return false;
896 
897     unsigned Align = Query.MMODescrs[0].AlignInBits;
898     unsigned RoundedSize = NextPowerOf2(Size);
899     return (Align >= RoundedSize);
900   };
901 
902   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
903   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
904   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
905 
906   // TODO: Refine based on subtargets which support unaligned access or 128-bit
907   // LDS
908   // TODO: Unsupported flat for SI.
909 
910   for (unsigned Op : {G_LOAD, G_STORE}) {
911     const bool IsStore = Op == G_STORE;
912 
913     auto &Actions = getActionDefinitionsBuilder(Op);
914     // Whitelist some common cases.
915     // TODO: Does this help compile time at all?
916     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
917                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
918                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
919                                       {S64, GlobalPtr, 64, GlobalAlign32},
920                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
921                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
922                                       {S32, GlobalPtr, 8, GlobalAlign8},
923                                       {S32, GlobalPtr, 16, GlobalAlign16},
924 
925                                       {S32, LocalPtr, 32, 32},
926                                       {S64, LocalPtr, 64, 32},
927                                       {V2S32, LocalPtr, 64, 32},
928                                       {S32, LocalPtr, 8, 8},
929                                       {S32, LocalPtr, 16, 16},
930                                       {V2S16, LocalPtr, 32, 32},
931 
932                                       {S32, PrivatePtr, 32, 32},
933                                       {S32, PrivatePtr, 8, 8},
934                                       {S32, PrivatePtr, 16, 16},
935                                       {V2S16, PrivatePtr, 32, 32},
936 
937                                       {S32, ConstantPtr, 32, GlobalAlign32},
938                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
939                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
940                                       {S64, ConstantPtr, 64, GlobalAlign32},
941                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
942     Actions.legalIf(
943       [=](const LegalityQuery &Query) -> bool {
944         return isLoadStoreLegal(ST, Query, Op);
945       });
946 
947     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
948     // 64-bits.
949     //
950     // TODO: Should generalize bitcast action into coerce, which will also cover
951     // inserting addrspacecasts.
952     Actions.customIf(typeIs(1, Constant32Ptr));
953 
954     // Turn any illegal element vectors into something easier to deal
955     // with. These will ultimately produce 32-bit scalar shifts to extract the
956     // parts anyway.
957     //
958     // For odd 16-bit element vectors, prefer to split those into pieces with
959     // 16-bit vector parts.
960     Actions.bitcastIf(
961       [=](const LegalityQuery &Query) -> bool {
962         LLT Ty = Query.Types[0];
963         return Ty.isVector() &&
964                isRegisterSize(Ty.getSizeInBits()) &&
965                !isRegisterVectorElementType(Ty.getElementType());
966       }, bitcastToRegisterType(0));
967 
968     Actions
969         .customIf(typeIs(1, Constant32Ptr))
970         // Widen suitably aligned loads by loading extra elements.
971         .moreElementsIf([=](const LegalityQuery &Query) {
972             const LLT Ty = Query.Types[0];
973             return Op == G_LOAD && Ty.isVector() &&
974                    shouldWidenLoadResult(Query, Op);
975           }, moreElementsToNextPow2(0))
976         .widenScalarIf([=](const LegalityQuery &Query) {
977             const LLT Ty = Query.Types[0];
978             return Op == G_LOAD && !Ty.isVector() &&
979                    shouldWidenLoadResult(Query, Op);
980           }, widenScalarOrEltToNextPow2(0))
981         .narrowScalarIf(
982             [=](const LegalityQuery &Query) -> bool {
983               return !Query.Types[0].isVector() &&
984                      needToSplitMemOp(Query, Op == G_LOAD);
985             },
986             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
987               const LLT DstTy = Query.Types[0];
988               const LLT PtrTy = Query.Types[1];
989 
990               const unsigned DstSize = DstTy.getSizeInBits();
991               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
992 
993               // Split extloads.
994               if (DstSize > MemSize)
995                 return std::make_pair(0, LLT::scalar(MemSize));
996 
997               if (!isPowerOf2_32(DstSize)) {
998                 // We're probably decomposing an odd sized store. Try to split
999                 // to the widest type. TODO: Account for alignment. As-is it
1000                 // should be OK, since the new parts will be further legalized.
1001                 unsigned FloorSize = PowerOf2Floor(DstSize);
1002                 return std::make_pair(0, LLT::scalar(FloorSize));
1003               }
1004 
1005               if (DstSize > 32 && (DstSize % 32 != 0)) {
1006                 // FIXME: Need a way to specify non-extload of larger size if
1007                 // suitably aligned.
1008                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1009               }
1010 
1011               unsigned MaxSize = maxSizeForAddrSpace(ST,
1012                                                      PtrTy.getAddressSpace(),
1013                                                      Op == G_LOAD);
1014               if (MemSize > MaxSize)
1015                 return std::make_pair(0, LLT::scalar(MaxSize));
1016 
              // If we get here, the remaining reason to split is most likely
              // insufficient alignment; break into pieces of the known
              // alignment in bits.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
1019             })
1020         .fewerElementsIf(
1021             [=](const LegalityQuery &Query) -> bool {
1022               return Query.Types[0].isVector() &&
1023                      needToSplitMemOp(Query, Op == G_LOAD);
1024             },
1025             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1026               const LLT DstTy = Query.Types[0];
1027               const LLT PtrTy = Query.Types[1];
1028 
1029               LLT EltTy = DstTy.getElementType();
1030               unsigned MaxSize = maxSizeForAddrSpace(ST,
1031                                                      PtrTy.getAddressSpace(),
1032                                                      Op == G_LOAD);
1033 
1034               // FIXME: Handle widened to power of 2 results better. This ends
1035               // up scalarizing.
1036               // FIXME: 3 element stores scalarized on SI
1037 
1038               // Split if it's too large for the address space.
1039               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1040                 unsigned NumElts = DstTy.getNumElements();
1041                 unsigned EltSize = EltTy.getSizeInBits();
1042 
1043                 if (MaxSize % EltSize == 0) {
1044                   return std::make_pair(
1045                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1046                 }
1047 
1048                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1049 
1050                 // FIXME: Refine when odd breakdowns handled
1051                 // The scalars will need to be re-legalized.
1052                 if (NumPieces == 1 || NumPieces >= NumElts ||
1053                     NumElts % NumPieces != 0)
1054                   return std::make_pair(0, EltTy);
1055 
1056                 return std::make_pair(0,
1057                                       LLT::vector(NumElts / NumPieces, EltTy));
1058               }
1059 
1060               // FIXME: We could probably handle weird extending loads better.
1061               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1062               if (DstTy.getSizeInBits() > MemSize)
1063                 return std::make_pair(0, EltTy);
1064 
1065               unsigned EltSize = EltTy.getSizeInBits();
1066               unsigned DstSize = DstTy.getSizeInBits();
1067               if (!isPowerOf2_32(DstSize)) {
1068                 // We're probably decomposing an odd sized store. Try to split
1069                 // to the widest type. TODO: Account for alignment. As-is it
1070                 // should be OK, since the new parts will be further legalized.
1071                 unsigned FloorSize = PowerOf2Floor(DstSize);
1072                 return std::make_pair(
1073                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1074               }
1075 
1076               // Need to split because of alignment.
1077               unsigned Align = Query.MMODescrs[0].AlignInBits;
1078               if (EltSize > Align &&
1079                   (EltSize / Align < DstTy.getNumElements())) {
1080                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1081               }
1082 
1083               // May need relegalization for the scalars.
1084               return std::make_pair(0, EltTy);
1085             })
1086         .minScalar(0, S32);
1087 
1088     if (IsStore)
1089       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1090 
1091     // TODO: Need a bitcast lower option?
1092     Actions
1093         .widenScalarToNextPow2(0)
1094         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1095   }
1096 
1097   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1098                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1099                                                   {S32, GlobalPtr, 16, 2 * 8},
1100                                                   {S32, LocalPtr, 8, 8},
1101                                                   {S32, LocalPtr, 16, 16},
1102                                                   {S32, PrivatePtr, 8, 8},
1103                                                   {S32, PrivatePtr, 16, 16},
1104                                                   {S32, ConstantPtr, 8, 8},
1105                                                   {S32, ConstantPtr, 16, 2 * 8}});
1106   if (ST.hasFlatAddressSpace()) {
1107     ExtLoads.legalForTypesWithMemDesc(
1108         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1109   }
1110 
1111   ExtLoads.clampScalar(0, S32, S32)
1112           .widenScalarToNextPow2(0)
1113           .unsupportedIfMemSizeNotPow2()
1114           .lower();
1115 
1116   auto &Atomics = getActionDefinitionsBuilder(
1117     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1118      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1119      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1120      G_ATOMICRMW_UMIN})
1121     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1122                {S64, GlobalPtr}, {S64, LocalPtr}});
1123   if (ST.hasFlatAddressSpace()) {
1124     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1125   }
1126 
1127   if (ST.hasLDSFPAtomics()) {
1128     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1129       .legalFor({{S32, LocalPtr}});
1130   }
1131 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1134   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1135     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1136                 {S32, FlatPtr}, {S64, FlatPtr}})
1137     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1138                {S32, RegionPtr}, {S64, RegionPtr}});
1139   // TODO: Pointer types, any 32-bit or 64-bit vector
1140 
1141   // Condition should be s32 for scalar, s1 for vector.
1142   getActionDefinitionsBuilder(G_SELECT)
1143     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1144           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1145           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1146     .clampScalar(0, S16, S64)
1147     .scalarize(1)
1148     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1149     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1150     .clampMaxNumElements(0, S32, 2)
1151     .clampMaxNumElements(0, LocalPtr, 2)
1152     .clampMaxNumElements(0, PrivatePtr, 2)
1153     .scalarize(0)
1154     .widenScalarToNextPow2(0)
1155     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1156 
1157   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1158   // be more flexible with the shift amount type.
1159   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1160     .legalFor({{S32, S32}, {S64, S32}});
1161   if (ST.has16BitInsts()) {
1162     if (ST.hasVOP3PInsts()) {
1163       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1164             .clampMaxNumElements(0, S16, 2);
1165     } else
1166       Shifts.legalFor({{S16, S16}});
1167 
1168     // TODO: Support 16-bit shift amounts for all types
1169     Shifts.widenScalarIf(
1170       [=](const LegalityQuery &Query) {
1171         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1172         // 32-bit amount.
1173         const LLT ValTy = Query.Types[0];
1174         const LLT AmountTy = Query.Types[1];
1175         return ValTy.getSizeInBits() <= 16 &&
1176                AmountTy.getSizeInBits() < 16;
1177       }, changeTo(1, S16));
1178     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1179     Shifts.clampScalar(1, S32, S32);
1180     Shifts.clampScalar(0, S16, S64);
1181     Shifts.widenScalarToNextPow2(0, 16);
1182   } else {
1183     // Make sure we legalize the shift amount type first, as the general
1184     // expansion for the shifted type will produce much worse code if it hasn't
1185     // been truncated already.
1186     Shifts.clampScalar(1, S32, S32);
1187     Shifts.clampScalar(0, S32, S64);
1188     Shifts.widenScalarToNextPow2(0, 32);
1189   }
1190   Shifts.scalarize(0);
1191 
1192   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1193     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1194     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1195     unsigned IdxTypeIdx = 2;
1196 
1197     getActionDefinitionsBuilder(Op)
1198       .customIf([=](const LegalityQuery &Query) {
1199           const LLT EltTy = Query.Types[EltTypeIdx];
1200           const LLT VecTy = Query.Types[VecTypeIdx];
1201           const LLT IdxTy = Query.Types[IdxTypeIdx];
1202           return (EltTy.getSizeInBits() == 16 ||
1203                   EltTy.getSizeInBits() % 32 == 0) &&
1204                  VecTy.getSizeInBits() % 32 == 0 &&
1205                  VecTy.getSizeInBits() <= 1024 &&
1206                  IdxTy.getSizeInBits() == 32;
1207         })
1208       .clampScalar(EltTypeIdx, S32, S64)
1209       .clampScalar(VecTypeIdx, S32, S64)
1210       .clampScalar(IdxTypeIdx, S32, S32);
1211   }
1212 
1213   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1214     .unsupportedIf([=](const LegalityQuery &Query) {
1215         const LLT &EltTy = Query.Types[1].getElementType();
1216         return Query.Types[0] != EltTy;
1217       });
1218 
1219   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1220     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1221     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1222 
1223     // FIXME: Doesn't handle extract of illegal sizes.
1224     getActionDefinitionsBuilder(Op)
1225       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1226       // FIXME: Multiples of 16 should not be legal.
1227       .legalIf([=](const LegalityQuery &Query) {
1228           const LLT BigTy = Query.Types[BigTyIdx];
1229           const LLT LitTy = Query.Types[LitTyIdx];
1230           return (BigTy.getSizeInBits() % 32 == 0) &&
1231                  (LitTy.getSizeInBits() % 16 == 0);
1232         })
1233       .widenScalarIf(
1234         [=](const LegalityQuery &Query) {
1235           const LLT BigTy = Query.Types[BigTyIdx];
1236           return (BigTy.getScalarSizeInBits() < 16);
1237         },
1238         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1239       .widenScalarIf(
1240         [=](const LegalityQuery &Query) {
1241           const LLT LitTy = Query.Types[LitTyIdx];
1242           return (LitTy.getScalarSizeInBits() < 16);
1243         },
1244         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1245       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1246       .widenScalarToNextPow2(BigTyIdx, 32);
1247 
1248   }
1249 
1250   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1251     .legalForCartesianProduct(AllS32Vectors, {S32})
1252     .legalForCartesianProduct(AllS64Vectors, {S64})
1253     .clampNumElements(0, V16S32, V32S32)
1254     .clampNumElements(0, V2S64, V16S64)
1255     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1256 
1257   if (ST.hasScalarPackInsts()) {
1258     BuildVector
1259       // FIXME: Should probably widen s1 vectors straight to s32
1260       .minScalarOrElt(0, S16)
1261       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1262       .minScalar(1, S32);
1263 
1264     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1265       .legalFor({V2S16, S32})
1266       .lower();
1267     BuildVector.minScalarOrElt(0, S32);
1268   } else {
1269     BuildVector.customFor({V2S16, S16});
1270     BuildVector.minScalarOrElt(0, S32);
1271 
1272     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1273       .customFor({V2S16, S32})
1274       .lower();
1275   }
1276 
1277   BuildVector.legalIf(isRegisterType(0));
1278 
1279   // FIXME: Clamp maximum size
1280   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1281     .legalIf(isRegisterType(0));
1282 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out before
  // legalization.
1285   if (ST.hasVOP3PInsts()) {
1286     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1287       .customFor({V2S16, V2S16})
1288       .lower();
1289   } else
1290     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1291 
1292   // Merge/Unmerge
1293   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1294     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1295     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1296 
1297     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1298       const LLT Ty = Query.Types[TypeIdx];
1299       if (Ty.isVector()) {
1300         const LLT &EltTy = Ty.getElementType();
1301         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1302           return true;
1303         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1304           return true;
1305       }
1306       return false;
1307     };
1308 
1309     auto &Builder = getActionDefinitionsBuilder(Op)
1310       .lowerFor({{S16, V2S16}})
1311       .lowerIf([=](const LegalityQuery &Query) {
1312           const LLT BigTy = Query.Types[BigTyIdx];
1313           return BigTy.getSizeInBits() == 32;
1314         })
1315       // Try to widen to s16 first for small types.
1316       // TODO: Only do this on targets with legal s16 shifts
1317       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1318       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1319       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1320       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1321                            elementTypeIs(1, S16)),
1322                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1326       .clampScalar(LitTyIdx, S32, S512)
1327       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1328       // Break up vectors with weird elements into scalars
1329       .fewerElementsIf(
1330         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1331         scalarize(0))
1332       .fewerElementsIf(
1333         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1334         scalarize(1))
1335       .clampScalar(BigTyIdx, S32, S1024);
1336 
1337     if (Op == G_MERGE_VALUES) {
1338       Builder.widenScalarIf(
1339         // TODO: Use 16-bit shifts if legal for 8-bit values?
1340         [=](const LegalityQuery &Query) {
1341           const LLT Ty = Query.Types[LitTyIdx];
1342           return Ty.getSizeInBits() < 32;
1343         },
1344         changeTo(LitTyIdx, S32));
1345     }
1346 
1347     Builder.widenScalarIf(
1348       [=](const LegalityQuery &Query) {
1349         const LLT Ty = Query.Types[BigTyIdx];
1350         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1351           Ty.getSizeInBits() % 16 != 0;
1352       },
1353       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
1356         const LLT &Ty = Query.Types[BigTyIdx];
1357         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1358         if (NewSizeInBits >= 256) {
1359           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1360           if (RoundedTo < NewSizeInBits)
1361             NewSizeInBits = RoundedTo;
1362         }
1363         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1364       })
1365       .legalIf([=](const LegalityQuery &Query) {
1366           const LLT &BigTy = Query.Types[BigTyIdx];
1367           const LLT &LitTy = Query.Types[LitTyIdx];
1368 
1369           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1370             return false;
1371           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1372             return false;
1373 
1374           return BigTy.getSizeInBits() % 16 == 0 &&
1375                  LitTy.getSizeInBits() % 16 == 0 &&
1376                  BigTy.getSizeInBits() <= 1024;
1377         })
1378       // Any vectors left are the wrong size. Scalarize them.
1379       .scalarize(0)
1380       .scalarize(1);
1381   }
1382 
1383   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1384   // RegBankSelect.
1385   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1386     .legalFor({{S32}, {S64}});
1387 
1388   if (ST.hasVOP3PInsts()) {
1389     SextInReg.lowerFor({{V2S16}})
1390       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1391       // get more vector shift opportunities, since we'll get those when
1392       // expanded.
1393       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1394   } else if (ST.has16BitInsts()) {
1395     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1396   } else {
1397     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1399     SextInReg.lowerFor({{S32}, {S64}});
1400   }
1401 
1402   SextInReg
1403     .scalarize(0)
1404     .clampScalar(0, S32, S64)
1405     .lower();
1406 
1407   getActionDefinitionsBuilder(G_FSHR)
1408     .legalFor({{S32, S32}})
1409     .scalarize(0)
1410     .lower();
1411 
1412   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1413     .legalFor({S64});
1414 
1415   getActionDefinitionsBuilder({
1416       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1417       G_FCOPYSIGN,
1418 
1419       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1420       G_READ_REGISTER,
1421       G_WRITE_REGISTER,
1422 
1423       G_SADDO, G_SSUBO,
1424 
1425        // TODO: Implement
1426       G_FMINIMUM, G_FMAXIMUM,
1427       G_FSHL
1428     }).lower();
1429 
1430   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1431         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1432         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1433     .unsupported();
1434 
1435   computeTables();
1436   verify(*ST.getInstrInfo());
1437 }
1438 
1439 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1440                                          MachineRegisterInfo &MRI,
1441                                          MachineIRBuilder &B,
1442                                          GISelChangeObserver &Observer) const {
1443   switch (MI.getOpcode()) {
1444   case TargetOpcode::G_ADDRSPACE_CAST:
1445     return legalizeAddrSpaceCast(MI, MRI, B);
1446   case TargetOpcode::G_FRINT:
1447     return legalizeFrint(MI, MRI, B);
1448   case TargetOpcode::G_FCEIL:
1449     return legalizeFceil(MI, MRI, B);
1450   case TargetOpcode::G_INTRINSIC_TRUNC:
1451     return legalizeIntrinsicTrunc(MI, MRI, B);
1452   case TargetOpcode::G_SITOFP:
1453     return legalizeITOFP(MI, MRI, B, true);
1454   case TargetOpcode::G_UITOFP:
1455     return legalizeITOFP(MI, MRI, B, false);
1456   case TargetOpcode::G_FPTOSI:
1457     return legalizeFPTOI(MI, MRI, B, true);
1458   case TargetOpcode::G_FPTOUI:
1459     return legalizeFPTOI(MI, MRI, B, false);
1460   case TargetOpcode::G_FMINNUM:
1461   case TargetOpcode::G_FMAXNUM:
1462   case TargetOpcode::G_FMINNUM_IEEE:
1463   case TargetOpcode::G_FMAXNUM_IEEE:
1464     return legalizeMinNumMaxNum(MI, MRI, B);
1465   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1466     return legalizeExtractVectorElt(MI, MRI, B);
1467   case TargetOpcode::G_INSERT_VECTOR_ELT:
1468     return legalizeInsertVectorElt(MI, MRI, B);
1469   case TargetOpcode::G_SHUFFLE_VECTOR:
1470     return legalizeShuffleVector(MI, MRI, B);
1471   case TargetOpcode::G_FSIN:
1472   case TargetOpcode::G_FCOS:
1473     return legalizeSinCos(MI, MRI, B);
1474   case TargetOpcode::G_GLOBAL_VALUE:
1475     return legalizeGlobalValue(MI, MRI, B);
1476   case TargetOpcode::G_LOAD:
1477     return legalizeLoad(MI, MRI, B, Observer);
1478   case TargetOpcode::G_FMAD:
1479     return legalizeFMad(MI, MRI, B);
1480   case TargetOpcode::G_FDIV:
1481     return legalizeFDIV(MI, MRI, B);
1482   case TargetOpcode::G_UDIV:
1483   case TargetOpcode::G_UREM:
1484     return legalizeUDIV_UREM(MI, MRI, B);
1485   case TargetOpcode::G_SDIV:
1486   case TargetOpcode::G_SREM:
1487     return legalizeSDIV_SREM(MI, MRI, B);
1488   case TargetOpcode::G_ATOMIC_CMPXCHG:
1489     return legalizeAtomicCmpXChg(MI, MRI, B);
1490   case TargetOpcode::G_FLOG:
1491     return legalizeFlog(MI, B, numbers::ln2f);
1492   case TargetOpcode::G_FLOG10:
1493     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1494   case TargetOpcode::G_FEXP:
1495     return legalizeFExp(MI, B);
1496   case TargetOpcode::G_FPOW:
1497     return legalizeFPow(MI, B);
1498   case TargetOpcode::G_FFLOOR:
1499     return legalizeFFloor(MI, MRI, B);
1500   case TargetOpcode::G_BUILD_VECTOR:
1501     return legalizeBuildVector(MI, MRI, B);
1502   default:
1503     return false;
1504   }
1505 
1506   llvm_unreachable("expected switch to return");
1507 }
1508 
1509 Register AMDGPULegalizerInfo::getSegmentAperture(
1510   unsigned AS,
1511   MachineRegisterInfo &MRI,
1512   MachineIRBuilder &B) const {
1513   MachineFunction &MF = B.getMF();
1514   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1515   const LLT S32 = LLT::scalar(32);
1516 
1517   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1518 
1519   if (ST.hasApertureRegs()) {
1520     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1521     // getreg.
1522     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1523         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1524         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1525     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1526         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1527         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1528     unsigned Encoding =
1529         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1530         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1531         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1532 
1533     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1534 
1535     B.buildInstr(AMDGPU::S_GETREG_B32)
1536       .addDef(GetReg)
1537       .addImm(Encoding);
1538     MRI.setType(GetReg, S32);
1539 
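         // The getreg result holds the aperture field in its low bits; shift it
         // left by the field width (WidthM1 + 1) to form the 32-bit aperture
         // value used as the high half of the 64-bit address.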
1540     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1541     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1542   }
1543 
1544   Register QueuePtr = MRI.createGenericVirtualRegister(
1545     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1546 
1547   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1548   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1549     return Register();
1550 
1551   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1552   // private_segment_aperture_base_hi.
1553   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1554 
1555   // TODO: can we be smarter about machine pointer info?
1556   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1557   MachineMemOperand *MMO = MF.getMachineMemOperand(
1558       PtrInfo,
1559       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1560           MachineMemOperand::MOInvariant,
1561       4, commonAlignment(Align(64), StructOffset));
1562 
1563   Register LoadAddr;
1564 
1565   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1566   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   MachineFunction &MF = B.getMF();
1573 
1574   const LLT S32 = LLT::scalar(32);
1575   Register Dst = MI.getOperand(0).getReg();
1576   Register Src = MI.getOperand(1).getReg();
1577 
1578   LLT DstTy = MRI.getType(Dst);
1579   LLT SrcTy = MRI.getType(Src);
1580   unsigned DestAS = DstTy.getAddressSpace();
1581   unsigned SrcAS = SrcTy.getAddressSpace();
1582 
1583   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1584   // vector element.
1585   assert(!DstTy.isVector());
1586 
1587   const AMDGPUTargetMachine &TM
1588     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1589 
1590   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1591   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1592     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1593     return true;
1594   }
1595 
1596   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1597     // Truncate.
1598     B.buildExtract(Dst, Src, 0);
1599     MI.eraseFromParent();
1600     return true;
1601   }
1602 
1603   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1604     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1605     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1606 
1607     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1608     // another pointer. Merge operands are required to be the same type, but
1609     // creating an extra ptrtoint would be kind of pointless.
1610     auto HighAddr = B.buildConstant(
1611       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1612     B.buildMerge(Dst, {Src, HighAddr});
1613     MI.eraseFromParent();
1614     return true;
1615   }
1616 
1617   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1618     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1619            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1620     unsigned NullVal = TM.getNullPointerValue(DestAS);
1621 
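         // A flat null pointer must map to the segment's null value rather than
         // being truncated, so compare the source against flat null and select
         // the segment null constant in that case.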
1622     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1623     auto FlatNull = B.buildConstant(SrcTy, 0);
1624 
1625     // Extract low 32-bits of the pointer.
1626     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1627 
1628     auto CmpRes =
1629         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1630     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1631 
1632     MI.eraseFromParent();
1633     return true;
1634   }
1635 
1636   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1637     return false;
1638 
1639   if (!ST.hasFlatAddressSpace())
1640     return false;
1641 
1642   auto SegmentNull =
1643       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1644   auto FlatNull =
1645       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1646 
1647   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1648   if (!ApertureReg.isValid())
1649     return false;
1650 
1651   auto CmpRes =
1652       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1653 
1654   // Coerce the type of the low half of the result so we can use merge_values.
1655   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1656 
1657   // TODO: Should we allow mismatched types but matching sizes in merges to
1658   // avoid the ptrtoint?
1659   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1660   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1661 
1662   MI.eraseFromParent();
1663   return true;
1664 }
1665 
1666 bool AMDGPULegalizerInfo::legalizeFrint(
1667   MachineInstr &MI, MachineRegisterInfo &MRI,
1668   MachineIRBuilder &B) const {
1669   Register Src = MI.getOperand(1).getReg();
1670   LLT Ty = MRI.getType(Src);
1671   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1672 
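       // C1 is 2^52: adding and then subtracting a copy of it (carrying the
       // sign of the source) pushes the fractional bits out of the f64
       // mantissa, rounding to integer. C2 is the largest magnitude below
       // 2^52; any |x| > C2 already has no fractional part, so the final
       // select returns it unchanged.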
1673   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1674   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1675 
1676   auto C1 = B.buildFConstant(Ty, C1Val);
1677   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1678 
1679   // TODO: Should this propagate fast-math-flags?
1680   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1681   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1682 
1683   auto C2 = B.buildFConstant(Ty, C2Val);
1684   auto Fabs = B.buildFAbs(Ty, Src);
1685 
1686   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1687   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1688   return true;
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeFceil(
1692   MachineInstr &MI, MachineRegisterInfo &MRI,
1693   MachineIRBuilder &B) const {
1694 
1695   const LLT S1 = LLT::scalar(1);
1696   const LLT S64 = LLT::scalar(64);
1697 
1698   Register Src = MI.getOperand(1).getReg();
1699   assert(MRI.getType(Src) == S64);
1700 
1701   // result = trunc(src)
1702   // if (src > 0.0 && src != result)
1703   //   result += 1.0
1704 
1705   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1706 
1707   const auto Zero = B.buildFConstant(S64, 0.0);
1708   const auto One = B.buildFConstant(S64, 1.0);
1709   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1710   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1711   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1712   auto Add = B.buildSelect(S64, And, One, Zero);
1713 
1714   // TODO: Should this propagate fast-math-flags?
1715   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1716   return true;
1717 }
1718 
1719 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1720                                               MachineIRBuilder &B) {
1721   const unsigned FractBits = 52;
1722   const unsigned ExpBits = 11;
1723   LLT S32 = LLT::scalar(32);
1724 
1725   auto Const0 = B.buildConstant(S32, FractBits - 32);
1726   auto Const1 = B.buildConstant(S32, ExpBits);
1727 
1728   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1729     .addUse(Register(Hi))
1730     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1731 
1732   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1736   MachineInstr &MI, MachineRegisterInfo &MRI,
1737   MachineIRBuilder &B) const {
1738   const LLT S1 = LLT::scalar(1);
1739   const LLT S32 = LLT::scalar(32);
1740   const LLT S64 = LLT::scalar(64);
1741 
1742   Register Src = MI.getOperand(1).getReg();
1743   assert(MRI.getType(Src) == S64);
1744 
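       // The approach: recover the unbiased exponent from the high word, then
       // clear the fraction bits that lie below the binary point. If the
       // exponent is negative the result is +/-0.0 (only the sign bit is
       // kept); if it is greater than 51 there are no fraction bits and the
       // source is returned unchanged.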
1745   // TODO: Should this use extract since the low half is unused?
1746   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1747   Register Hi = Unmerge.getReg(1);
1748 
1749   // Extract the upper half, since this is where we will find the sign and
1750   // exponent.
1751   auto Exp = extractF64Exponent(Hi, B);
1752 
1753   const unsigned FractBits = 52;
1754 
1755   // Extract the sign bit.
1756   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1757   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1758 
1759   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1760 
1761   const auto Zero32 = B.buildConstant(S32, 0);
1762 
1763   // Extend back to 64-bits.
1764   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1765 
1766   auto Shr = B.buildAShr(S64, FractMask, Exp);
1767   auto Not = B.buildNot(S64, Shr);
1768   auto Tmp0 = B.buildAnd(S64, Src, Not);
1769   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1770 
1771   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1772   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1773 
1774   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1775   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1776   return true;
1777 }
1778 
1779 bool AMDGPULegalizerInfo::legalizeITOFP(
1780   MachineInstr &MI, MachineRegisterInfo &MRI,
1781   MachineIRBuilder &B, bool Signed) const {
1782 
1783   Register Dst = MI.getOperand(0).getReg();
1784   Register Src = MI.getOperand(1).getReg();
1785 
1786   const LLT S64 = LLT::scalar(64);
1787   const LLT S32 = LLT::scalar(32);
1788 
1789   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1790 
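       // Convert the two 32-bit halves separately and recombine them as
       // fp(Hi) * 2^32 + fp(Lo); only the high half uses a signed conversion
       // for G_SITOFP. For example, 0x0000000300000001 becomes
       // 3.0 * 2^32 + 1.0.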
1791   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1792 
1793   auto CvtHi = Signed ?
1794     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1795     B.buildUITOFP(S64, Unmerge.getReg(1));
1796 
1797   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1798 
1799   auto ThirtyTwo = B.buildConstant(S32, 32);
1800   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1801     .addUse(CvtHi.getReg(0))
1802     .addUse(ThirtyTwo.getReg(0));
1803 
1804   // TODO: Should this propagate fast-math-flags?
1805   B.buildFAdd(Dst, LdExp, CvtLo);
1806   MI.eraseFromParent();
1807   return true;
1808 }
1809 
1810 // TODO: Copied from DAG implementation. Verify logic and document how this
1811 // actually works.
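     // The sequence below appears to work as follows: K0 is 2^-32 and K1 is
     // -2^32, so FloorMul = floor(trunc(x) * 2^-32) forms the high 32-bit half
     // of the result, and Fma = fma(FloorMul, -2^32, trunc(x)) recovers the
     // low 32 bits as the remainder trunc(x) - FloorMul * 2^32.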
1812 bool AMDGPULegalizerInfo::legalizeFPTOI(
1813   MachineInstr &MI, MachineRegisterInfo &MRI,
1814   MachineIRBuilder &B, bool Signed) const {
1815 
1816   Register Dst = MI.getOperand(0).getReg();
1817   Register Src = MI.getOperand(1).getReg();
1818 
1819   const LLT S64 = LLT::scalar(64);
1820   const LLT S32 = LLT::scalar(32);
1821 
1822   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1823 
1824   unsigned Flags = MI.getFlags();
1825 
1826   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1827   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1828   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1829 
1830   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1831   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1832   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1833 
1834   auto Hi = Signed ?
1835     B.buildFPTOSI(S32, FloorMul) :
1836     B.buildFPTOUI(S32, FloorMul);
1837   auto Lo = B.buildFPTOUI(S32, Fma);
1838 
1839   B.buildMerge(Dst, { Lo, Hi });
1840   MI.eraseFromParent();
1841 
1842   return true;
1843 }
1844 
1845 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1846   MachineInstr &MI, MachineRegisterInfo &MRI,
1847   MachineIRBuilder &B) const {
1848   MachineFunction &MF = B.getMF();
1849   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1850 
1851   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1852                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1853 
1854   // With ieee_mode disabled, the instructions have the correct behavior
1855   // already for G_FMINNUM/G_FMAXNUM
1856   if (!MFI->getMode().IEEE)
1857     return !IsIEEEOp;
1858 
1859   if (IsIEEEOp)
1860     return true;
1861 
1862   MachineIRBuilder HelperBuilder(MI);
1863   GISelObserverWrapper DummyObserver;
1864   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1865   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1866 }
1867 
1868 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1869   MachineInstr &MI, MachineRegisterInfo &MRI,
1870   MachineIRBuilder &B) const {
1871   // TODO: Should move some of this into LegalizerHelper.
1872 
1873   // TODO: Promote dynamic indexing of s16 to s32
1874 
1875   // FIXME: Artifact combiner probably should have replaced the truncated
1876   // constant before this, so we shouldn't need
1877   // getConstantVRegValWithLookThrough.
1878   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1879     MI.getOperand(2).getReg(), MRI);
1880   if (!IdxVal) // Dynamic case will be selected to register indexing.
1881     return true;
1882 
1883   Register Dst = MI.getOperand(0).getReg();
1884   Register Vec = MI.getOperand(1).getReg();
1885 
1886   LLT VecTy = MRI.getType(Vec);
1887   LLT EltTy = VecTy.getElementType();
1888   assert(EltTy == MRI.getType(Dst));
1889 
1890   if (IdxVal->Value < VecTy.getNumElements())
1891     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1892   else
1893     B.buildUndef(Dst);
1894 
1895   MI.eraseFromParent();
1896   return true;
1897 }
1898 
1899 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1900   MachineInstr &MI, MachineRegisterInfo &MRI,
1901   MachineIRBuilder &B) const {
1902   // TODO: Should move some of this into LegalizerHelper.
1903 
1904   // TODO: Promote dynamic indexing of s16 to s32
1905 
1906   // FIXME: Artifact combiner probably should have replaced the truncated
1907   // constant before this, so we shouldn't need
1908   // getConstantVRegValWithLookThrough.
1909   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1910     MI.getOperand(3).getReg(), MRI);
1911   if (!IdxVal) // Dynamic case will be selected to register indexing.
1912     return true;
1913 
1914   Register Dst = MI.getOperand(0).getReg();
1915   Register Vec = MI.getOperand(1).getReg();
1916   Register Ins = MI.getOperand(2).getReg();
1917 
1918   LLT VecTy = MRI.getType(Vec);
1919   LLT EltTy = VecTy.getElementType();
1920   assert(EltTy == MRI.getType(Ins));
1921 
1922   if (IdxVal->Value < VecTy.getNumElements())
1923     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1924   else
1925     B.buildUndef(Dst);
1926 
1927   MI.eraseFromParent();
1928   return true;
1929 }
1930 
1931 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1932   MachineInstr &MI, MachineRegisterInfo &MRI,
1933   MachineIRBuilder &B) const {
1934   const LLT V2S16 = LLT::vector(2, 16);
1935 
1936   Register Dst = MI.getOperand(0).getReg();
1937   Register Src0 = MI.getOperand(1).getReg();
1938   LLT DstTy = MRI.getType(Dst);
1939   LLT SrcTy = MRI.getType(Src0);
1940 
1941   if (SrcTy == V2S16 && DstTy == V2S16 &&
1942       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1943     return true;
1944 
1945   MachineIRBuilder HelperBuilder(MI);
1946   GISelObserverWrapper DummyObserver;
1947   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1948   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeSinCos(
1952   MachineInstr &MI, MachineRegisterInfo &MRI,
1953   MachineIRBuilder &B) const {
1954 
1955   Register DstReg = MI.getOperand(0).getReg();
1956   Register SrcReg = MI.getOperand(1).getReg();
1957   LLT Ty = MRI.getType(DstReg);
1958   unsigned Flags = MI.getFlags();
1959 
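       // The hardware trig instructions take an input scaled by 1/(2*pi); on
       // subtargets with a reduced valid input range, the scaled value is
       // first wrapped into [0, 1) with the fract intrinsic.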
1960   Register TrigVal;
1961   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1962   if (ST.hasTrigReducedRange()) {
1963     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1964     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1965       .addUse(MulVal.getReg(0))
1966       .setMIFlags(Flags).getReg(0);
1967   } else
1968     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1969 
1970   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1971     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1972   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1973     .addUse(TrigVal)
1974     .setMIFlags(Flags);
1975   MI.eraseFromParent();
1976   return true;
1977 }
1978 
1979 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1980   Register DstReg, LLT PtrTy,
1981   MachineIRBuilder &B, const GlobalValue *GV,
1982   unsigned Offset, unsigned GAFlags) const {
1983   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1984   // to the following code sequence:
1985   //
1986   // For constant address space:
1987   //   s_getpc_b64 s[0:1]
1988   //   s_add_u32 s0, s0, $symbol
1989   //   s_addc_u32 s1, s1, 0
1990   //
1991   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1992   //   a fixup or relocation is emitted to replace $symbol with a literal
1993   //   constant, which is a pc-relative offset from the encoding of the $symbol
1994   //   operand to the global variable.
1995   //
1996   // For global address space:
1997   //   s_getpc_b64 s[0:1]
1998   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1999   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2000   //
2001   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2002   //   fixups or relocations are emitted to replace $symbol@*@lo and
2003   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2004   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2005   //   operand to the global variable.
2006   //
2007   // What we want here is an offset from the value returned by s_getpc
2008   // (which is the address of the s_add_u32 instruction) to the global
2009   // variable, but since the encoding of $symbol starts 4 bytes after the start
2010   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2011   // small. This requires us to add 4 to the global variable offset in order to
2012   // compute the correct address.
2013 
2014   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2015 
2016   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2017     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2018 
2019   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2020     .addDef(PCReg);
2021 
2022   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2023   if (GAFlags == SIInstrInfo::MO_NONE)
2024     MIB.addImm(0);
2025   else
2026     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2027 
2028   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2029 
2030   if (PtrTy.getSizeInBits() == 32)
2031     B.buildExtract(DstReg, PCReg, 0);
2032   return true;
2033 }
2034 
2035 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2036   MachineInstr &MI, MachineRegisterInfo &MRI,
2037   MachineIRBuilder &B) const {
2038   Register DstReg = MI.getOperand(0).getReg();
2039   LLT Ty = MRI.getType(DstReg);
2040   unsigned AS = Ty.getAddressSpace();
2041 
2042   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2043   MachineFunction &MF = B.getMF();
2044   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2045 
2046   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2047     if (!MFI->isEntryFunction()) {
2048       const Function &Fn = MF.getFunction();
2049       DiagnosticInfoUnsupported BadLDSDecl(
2050         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2051         DS_Warning);
2052       Fn.getContext().diagnose(BadLDSDecl);
2053 
2054       // We currently don't have a way to correctly allocate LDS objects that
2055       // aren't directly associated with a kernel. We do force inlining of
2056       // functions that use local objects. However, if these dead functions are
2057       // not eliminated, we don't want a compile time error. Just emit a warning
2058       // and a trap, since there should be no callable path here.
2059       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2060       B.buildUndef(DstReg);
2061       MI.eraseFromParent();
2062       return true;
2063     }
2064 
2065     // TODO: We could emit code to handle the initialization somewhere.
2066     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2067       const SITargetLowering *TLI = ST.getTargetLowering();
2068       if (!TLI->shouldUseLDSConstAddress(GV)) {
2069         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2070         return true; // Leave in place.
2071       }
2072 
2073       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2074       MI.eraseFromParent();
2075       return true;
2076     }
2077 
2078     const Function &Fn = MF.getFunction();
2079     DiagnosticInfoUnsupported BadInit(
2080       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2081     Fn.getContext().diagnose(BadInit);
2082     return true;
2083   }
2084 
2085   const SITargetLowering *TLI = ST.getTargetLowering();
2086 
2087   if (TLI->shouldEmitFixup(GV)) {
2088     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2089     MI.eraseFromParent();
2090     return true;
2091   }
2092 
2093   if (TLI->shouldEmitPCReloc(GV)) {
2094     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2095     MI.eraseFromParent();
2096     return true;
2097   }
2098 
2099   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2100   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2101 
2102   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2103       MachinePointerInfo::getGOT(MF),
2104       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2105           MachineMemOperand::MOInvariant,
2106       8 /*Size*/, Align(8));
2107 
2108   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2109 
2110   if (Ty.getSizeInBits() == 32) {
2111     // Truncate if this is a 32-bit constant address.
2112     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2113     B.buildExtract(DstReg, Load, 0);
2114   } else
2115     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2116 
2117   MI.eraseFromParent();
2118   return true;
2119 }
2120 
2121 bool AMDGPULegalizerInfo::legalizeLoad(
2122   MachineInstr &MI, MachineRegisterInfo &MRI,
2123   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2124   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2125   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2126   Observer.changingInstr(MI);
2127   MI.getOperand(1).setReg(Cast.getReg(0));
2128   Observer.changedInstr(MI);
2129   return true;
2130 }
2131 
2132 bool AMDGPULegalizerInfo::legalizeFMad(
2133   MachineInstr &MI, MachineRegisterInfo &MRI,
2134   MachineIRBuilder &B) const {
2135   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2136   assert(Ty.isScalar());
2137 
2138   MachineFunction &MF = B.getMF();
2139   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2140 
2141   // TODO: Always legal with future ftz flag.
2142   // FIXME: Do we need just output?
2143   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2144     return true;
2145   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2146     return true;
2147 
2148   MachineIRBuilder HelperBuilder(MI);
2149   GISelObserverWrapper DummyObserver;
2150   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2151   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2152 }
2153 
2154 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2155   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2156   Register DstReg = MI.getOperand(0).getReg();
2157   Register PtrReg = MI.getOperand(1).getReg();
2158   Register CmpVal = MI.getOperand(2).getReg();
2159   Register NewVal = MI.getOperand(3).getReg();
2160 
2161   assert(SITargetLowering::isFlatGlobalAddrSpace(
2162            MRI.getType(PtrReg).getAddressSpace()) &&
2163          "this should not have been custom lowered");
2164 
2165   LLT ValTy = MRI.getType(CmpVal);
2166   LLT VecTy = LLT::vector(2, ValTy);
2167 
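       // The target cmpxchg pseudo takes the new value and the compare value
       // packed together as a single two-element vector operand.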
2168   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2169 
2170   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2171     .addDef(DstReg)
2172     .addUse(PtrReg)
2173     .addUse(PackedVal)
2174     .setMemRefs(MI.memoperands());
2175 
2176   MI.eraseFromParent();
2177   return true;
2178 }
2179 
2180 bool AMDGPULegalizerInfo::legalizeFlog(
2181   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2182   Register Dst = MI.getOperand(0).getReg();
2183   Register Src = MI.getOperand(1).getReg();
2184   LLT Ty = B.getMRI()->getType(Dst);
2185   unsigned Flags = MI.getFlags();
2186 
2187   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2188   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2189 
2190   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2191   MI.eraseFromParent();
2192   return true;
2193 }
2194 
2195 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2196                                        MachineIRBuilder &B) const {
2197   Register Dst = MI.getOperand(0).getReg();
2198   Register Src = MI.getOperand(1).getReg();
2199   unsigned Flags = MI.getFlags();
2200   LLT Ty = B.getMRI()->getType(Dst);
2201 
2202   auto K = B.buildFConstant(Ty, numbers::log2e);
2203   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2204   B.buildFExp2(Dst, Mul, Flags);
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2210                                        MachineIRBuilder &B) const {
2211   Register Dst = MI.getOperand(0).getReg();
2212   Register Src0 = MI.getOperand(1).getReg();
2213   Register Src1 = MI.getOperand(2).getReg();
2214   unsigned Flags = MI.getFlags();
2215   LLT Ty = B.getMRI()->getType(Dst);
2216   const LLT S16 = LLT::scalar(16);
2217   const LLT S32 = LLT::scalar(32);
2218 
2219   if (Ty == S32) {
2220     auto Log = B.buildFLog2(S32, Src0, Flags);
2221     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2222       .addUse(Log.getReg(0))
2223       .addUse(Src1)
2224       .setMIFlags(Flags);
2225     B.buildFExp2(Dst, Mul, Flags);
2226   } else if (Ty == S16) {
2227     // There's no f16 fmul_legacy, so we need to convert for it.
2228     auto Log = B.buildFLog2(S16, Src0, Flags);
2229     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2230     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2231     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2232       .addUse(Ext0.getReg(0))
2233       .addUse(Ext1.getReg(0))
2234       .setMIFlags(Flags);
2235 
2236     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2237   } else
2238     return false;
2239 
2240   MI.eraseFromParent();
2241   return true;
2242 }
2243 
2244 // Find a source register, ignoring any possible source modifiers.
2245 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2246   Register ModSrc = OrigSrc;
2247   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2248     ModSrc = SrcFNeg->getOperand(1).getReg();
2249     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2250       ModSrc = SrcFAbs->getOperand(1).getReg();
2251   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2252     ModSrc = SrcFAbs->getOperand(1).getReg();
2253   return ModSrc;
2254 }
2255 
2256 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2257                                          MachineRegisterInfo &MRI,
2258                                          MachineIRBuilder &B) const {
2259 
2260   const LLT S1 = LLT::scalar(1);
2261   const LLT S64 = LLT::scalar(64);
2262   Register Dst = MI.getOperand(0).getReg();
2263   Register OrigSrc = MI.getOperand(1).getReg();
2264   unsigned Flags = MI.getFlags();
2265   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2266          "this should not have been custom lowered");
2267 
2268   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2269   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2270   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2271   // V_FRACT bug is:
2272   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2273   //
2274   // Convert floor(x) to (x - fract(x))
2275 
2276   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2277     .addUse(OrigSrc)
2278     .setMIFlags(Flags);
2279 
2280   // Give source modifier matching some assistance before obscuring a foldable
2281   // pattern.
2282 
2283   // TODO: We can avoid the neg on the fract? The input sign to fract
2284   // shouldn't matter?
2285   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2286 
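       // 0x3fefffffffffffff is the largest double strictly less than 1.0,
       // i.e. the 0.99999999999999999 clamp value from the comment above.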
2287   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2288 
2289   Register Min = MRI.createGenericVirtualRegister(S64);
2290 
2291   // We don't need to concern ourselves with the snan handling difference, so
2292   // use the one which will directly select.
2293   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2294   if (MFI->getMode().IEEE)
2295     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2296   else
2297     B.buildFMinNum(Min, Fract, Const, Flags);
2298 
2299   Register CorrectedFract = Min;
2300   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2301     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2302     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2303   }
2304 
2305   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2306   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2307 
2308   MI.eraseFromParent();
2309   return true;
2310 }
2311 
2312 // Turn an illegal packed v2s16 build vector into bit operations.
2313 // TODO: This should probably be a bitcast action in LegalizerHelper.
2314 bool AMDGPULegalizerInfo::legalizeBuildVector(
2315   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2316   Register Dst = MI.getOperand(0).getReg();
2317   const LLT S32 = LLT::scalar(32);
2318   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2319 
2320   Register Src0 = MI.getOperand(1).getReg();
2321   Register Src1 = MI.getOperand(2).getReg();
2322   assert(MRI.getType(Src0) == LLT::scalar(16));
2323 
2324   auto Merge = B.buildMerge(S32, {Src0, Src1});
2325   B.buildBitcast(Dst, Merge);
2326 
2327   MI.eraseFromParent();
2328   return true;
2329 }
2330 
2331 // Return the use branch instruction, otherwise null if the usage is invalid.
2332 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2333                                        MachineRegisterInfo &MRI,
2334                                        MachineInstr *&Br,
2335                                        MachineBasicBlock *&UncondBrTarget) {
2336   Register CondDef = MI.getOperand(0).getReg();
2337   if (!MRI.hasOneNonDBGUse(CondDef))
2338     return nullptr;
2339 
2340   MachineBasicBlock *Parent = MI.getParent();
2341   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2342   if (UseMI.getParent() != Parent ||
2343       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2344     return nullptr;
2345 
2346   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2347   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2348   if (Next == Parent->end()) {
2349     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2350     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2351       return nullptr;
2352     UncondBrTarget = &*NextMBB;
2353   } else {
2354     if (Next->getOpcode() != AMDGPU::G_BR)
2355       return nullptr;
2356     Br = &*Next;
2357     UncondBrTarget = Br->getOperand(0).getMBB();
2358   }
2359 
2360   return &UseMI;
2361 }
2362 
2363 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2364                                                MachineRegisterInfo &MRI,
2365                                                Register LiveIn,
2366                                                Register PhyReg) const {
2367   assert(PhyReg.isPhysical() && "Physical register expected");
2368 
2369   // Insert the live-in copy, if required, by defining destination virtual
2370   // register.
2371   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2372   if (!MRI.getVRegDef(LiveIn)) {
2373     // FIXME: Should have scoped insert pt
2374     MachineBasicBlock &OrigInsBB = B.getMBB();
2375     auto OrigInsPt = B.getInsertPt();
2376 
2377     MachineBasicBlock &EntryMBB = B.getMF().front();
2378     EntryMBB.addLiveIn(PhyReg);
2379     B.setInsertPt(EntryMBB, EntryMBB.begin());
2380     B.buildCopy(LiveIn, PhyReg);
2381 
2382     B.setInsertPt(OrigInsBB, OrigInsPt);
2383   }
2384 
2385   return LiveIn;
2386 }
2387 
2388 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2389                                                 MachineRegisterInfo &MRI,
2390                                                 Register PhyReg, LLT Ty,
2391                                                 bool InsertLiveInCopy) const {
2392   assert(PhyReg.isPhysical() && "Physical register expected");
2393 
2394   // Get or create the virtual live-in register.
2395   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2396   if (!LiveIn) {
2397     LiveIn = MRI.createGenericVirtualRegister(Ty);
2398     MRI.addLiveIn(PhyReg, LiveIn);
2399   }
2400 
2401   // When the actual copy that is ultimately required goes from a virtual
2402   // register to a physical register (and will be inserted later), a live-in
2403   // copy from the physical register to a virtual register is not needed here.
2404   if (!InsertLiveInCopy)
2405     return LiveIn;
2406 
2407   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2408 }
2409 
2410 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2411     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2412   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2413   const ArgDescriptor *Arg;
2414   const TargetRegisterClass *RC;
2415   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2416   if (!Arg) {
2417     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2418     return nullptr;
2419   }
2420   return Arg;
2421 }
2422 
2423 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2424                                          const ArgDescriptor *Arg) const {
2425   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2426     return false; // TODO: Handle these
2427 
2428   Register SrcReg = Arg->getRegister();
2429   assert(SrcReg.isPhysical() && "Physical register expected");
2430   assert(DstReg.isVirtual() && "Virtual register expected");
2431 
2432   MachineRegisterInfo &MRI = *B.getMRI();
2433 
2434   LLT Ty = MRI.getType(DstReg);
2435   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2436 
2437   if (Arg->isMasked()) {
2438     // TODO: Should we try to emit this once in the entry block?
2439     const LLT S32 = LLT::scalar(32);
2440     const unsigned Mask = Arg->getMask();
2441     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2442 
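         // The value lives in the bitfield of SrcReg described by Mask: shift
         // it down by the mask's trailing zero count, then AND with the
         // shifted mask to extract it.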
2443     Register AndMaskSrc = LiveIn;
2444 
2445     if (Shift != 0) {
2446       auto ShiftAmt = B.buildConstant(S32, Shift);
2447       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2448     }
2449 
2450     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2451   } else {
2452     B.buildCopy(DstReg, LiveIn);
2453   }
2454 
2455   return true;
2456 }
2457 
2458 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2459     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2460     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2461 
2462   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2463   if (!Arg)
2464     return false;
2465 
2466   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2467     return false;
2468 
2469   MI.eraseFromParent();
2470   return true;
2471 }
2472 
2473 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2474                                        MachineRegisterInfo &MRI,
2475                                        MachineIRBuilder &B) const {
2476   Register Dst = MI.getOperand(0).getReg();
2477   LLT DstTy = MRI.getType(Dst);
2478   LLT S16 = LLT::scalar(16);
2479   LLT S32 = LLT::scalar(32);
2480   LLT S64 = LLT::scalar(64);
2481 
2482   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2483     return true;
2484 
2485   if (DstTy == S16)
2486     return legalizeFDIV16(MI, MRI, B);
2487   if (DstTy == S32)
2488     return legalizeFDIV32(MI, MRI, B);
2489   if (DstTy == S64)
2490     return legalizeFDIV64(MI, MRI, B);
2491 
2492   return false;
2493 }
2494 
2495 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2496   const LLT S32 = LLT::scalar(32);
2497 
2498   auto Cvt0 = B.buildUITOFP(S32, Src);
2499   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
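       // 0x4f800000 is 2^32 as an f32 (UINT_MAX + 1); the multiply scales the
       // f32 reciprocal back up into the unsigned 32-bit integer range.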
2500   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2501   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2502   return B.buildFPTOUI(S32, Mul).getReg(0);
2503 }
2504 
2505 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2506                                                   Register DstReg,
2507                                                   Register Num,
2508                                                   Register Den,
2509                                                   bool IsRem) const {
2510   const LLT S1 = LLT::scalar(1);
2511   const LLT S32 = LLT::scalar(32);
2512 
2513   // RCP =  URECIP(Den) = 2^32 / Den + e
2514   // e is rounding error.
2515   auto RCP = buildDivRCP(B, Den);
2516 
2517   // RCP_LO = mul(RCP, Den)
2518   auto RCP_LO = B.buildMul(S32, RCP, Den);
2519 
2520   // RCP_HI = mulhu(RCP, Den)
2521   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2522 
2523   // NEG_RCP_LO = -RCP_LO
2524   auto Zero = B.buildConstant(S32, 0);
2525   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2526 
2527   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2528   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2529   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2530 
2531   // Calculate the rounding error from the URECIP instruction
2532   // E = mulhu(ABS_RCP_LO, RCP)
2533   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2534 
2535   // RCP_A_E = RCP + E
2536   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2537 
2538   // RCP_S_E = RCP - E
2539   auto RCP_S_E = B.buildSub(S32, RCP, E);
2540 
2541   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2542   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2543 
2544   // Quotient = mulhu(Tmp0, Num)
2545   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2546 
2547   // Num_S_Remainder = Quotient * Den
2548   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2549 
2550   // Remainder = Num - Num_S_Remainder
2551   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2552 
2553   // Remainder_GE_Den = Remainder >= Den
2554   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2555 
2556   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2557   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2558                                        Num, Num_S_Remainder);
2559 
2560   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2561   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2562 
2563   // Calculate Division result:
2564 
2565   // Quotient_A_One = Quotient + 1
2566   auto One = B.buildConstant(S32, 1);
2567   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2568 
2569   // Quotient_S_One = Quotient - 1
2570   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2571 
2572   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2573   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2574 
2575   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2576   if (IsRem) {
2577     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2578 
2579     // Calculate Rem result:
2580     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2581 
2582     // Remainder_A_Den = Remainder + Den
2583     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2584 
2585     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2586     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2587 
2588     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2589     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2590   } else {
2591     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2592   }
2593 }
2594 
2595 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2596                                               MachineRegisterInfo &MRI,
2597                                               MachineIRBuilder &B) const {
2598   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2599   Register DstReg = MI.getOperand(0).getReg();
2600   Register Num = MI.getOperand(1).getReg();
2601   Register Den = MI.getOperand(2).getReg();
2602   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2603   MI.eraseFromParent();
2604   return true;
2605 }
2606 
2607 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2608 //
2609 // Return lo, hi of result
2610 //
2611 // %cvt.lo = G_UITOFP Val.lo
2612 // %cvt.hi = G_UITOFP Val.hi
2613 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2614 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2615 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2616 // %mul2 = G_FMUL %mul1, 2**(-32)
2617 // %trunc = G_INTRINSIC_TRUNC %mul2
2618 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2619 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
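     // (0x5f7ffffc is just below 2**64 as an f32, so %mul1 approximates
     // 2**64 / Val; the trunc/fmad pair then splits that into the high and
     // low 32-bit pieces.)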
2620 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2621                                                        Register Val) {
2622   const LLT S32 = LLT::scalar(32);
2623   auto Unmerge = B.buildUnmerge(S32, Val);
2624 
2625   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2626   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2627 
2628   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2629                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2630 
2631   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2632   auto Mul1 =
2633       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2634 
2635   // 2**(-32)
2636   auto Mul2 =
2637       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2638   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2639 
2640   // -(2**32)
2641   auto Mad2 = B.buildFMAD(S32, Trunc,
2642                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2643 
2644   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2645   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2646 
2647   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2648 }
2649 
2650 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2651                                               MachineRegisterInfo &MRI,
2652                                               MachineIRBuilder &B) const {
2653   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2654   const LLT S32 = LLT::scalar(32);
2655   const LLT S64 = LLT::scalar(64);
2656   const LLT S1 = LLT::scalar(1);
2657   Register Numer = MI.getOperand(1).getReg();
2658   Register Denom = MI.getOperand(2).getReg();
2659   Register RcpLo, RcpHi;
2660 
2661   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2662 
2663   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2664 
2665   auto Zero64 = B.buildConstant(S64, 0);
2666   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2667 
2668   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2669   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2670 
2671   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2672   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2673   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2674 
2675   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2676   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2677   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2678   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2679 
2680   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2681   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2682   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2683   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2684   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2685 
2686   auto Zero32 = B.buildConstant(S32, 0);
2687   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2688   auto Add2_HiC =
2689       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2690   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2691   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2692 
2693   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2694   Register NumerLo = UnmergeNumer.getReg(0);
2695   Register NumerHi = UnmergeNumer.getReg(1);
2696 
2697   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2698   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2699   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2700   Register Mul3_Lo = UnmergeMul3.getReg(0);
2701   Register Mul3_Hi = UnmergeMul3.getReg(1);
2702   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2703   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2704   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2705   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2706 
2707   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2708   Register DenomLo = UnmergeDenom.getReg(0);
2709   Register DenomHi = UnmergeDenom.getReg(1);
2710 
2711   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2712   auto C1 = B.buildSExt(S32, CmpHi);
2713 
2714   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2715   auto C2 = B.buildSExt(S32, CmpLo);
2716 
2717   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2718   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2719 
2720   // TODO: Here and below portions of the code can be enclosed into if/endif.
2721   // Currently control flow is unconditional and we have 4 selects after
2722   // potential endif to substitute PHIs.
2723 
2724   // if C3 != 0 ...
2725   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2726   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2727   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2728   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2729 
2730   auto One64 = B.buildConstant(S64, 1);
2731   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2732 
2733   auto C4 =
2734       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2735   auto C5 =
2736       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2737   auto C6 = B.buildSelect(
2738       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2739 
2740   // if (C6 != 0)
2741   auto Add4 = B.buildAdd(S64, Add3, One64);
2742   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2743 
2744   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2745   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2746   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2747 
2748   // endif C6
2749   // endif C3
2750 
2751   if (IsDiv) {
2752     auto Sel1 = B.buildSelect(
2753         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2754     B.buildSelect(MI.getOperand(0),
2755                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2756   } else {
2757     auto Sel2 = B.buildSelect(
2758         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2759     B.buildSelect(MI.getOperand(0),
2760                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2761   }
2762 
2763   MI.eraseFromParent();
2764   return true;
2765 }
2766 
2767 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2768                                             MachineRegisterInfo &MRI,
2769                                             MachineIRBuilder &B) const {
2770   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2771   if (Ty == LLT::scalar(32))
2772     return legalizeUDIV_UREM32(MI, MRI, B);
2773   if (Ty == LLT::scalar(64))
2774     return legalizeUDIV_UREM64(MI, MRI, B);
2775   return false;
2776 }
2777 
2778 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2779                                               MachineRegisterInfo &MRI,
2780                                               MachineIRBuilder &B) const {
2781   const LLT S32 = LLT::scalar(32);
2782 
2783   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2784   Register DstReg = MI.getOperand(0).getReg();
2785   Register LHS = MI.getOperand(1).getReg();
2786   Register RHS = MI.getOperand(2).getReg();
2787 
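       // Reduce to the unsigned routine: x >> 31 is an all-ones mask for
       // negative values, and (x + mask) ^ mask computes |x|. The quotient's
       // sign is LHSign ^ RHSign, the remainder takes the sign of the LHS, and
       // the same (v ^ sign) - sign pattern reapplies the sign below.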
2788   auto ThirtyOne = B.buildConstant(S32, 31);
2789   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2790   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2791 
2792   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2793   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2794 
2795   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2796   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2797 
2798   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2799   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2800 
2801   if (IsRem) {
2802     auto RSign = LHSign; // Remainder sign is the same as LHS
2803     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2804     B.buildSub(DstReg, UDivRem, RSign);
2805   } else {
2806     auto DSign = B.buildXor(S32, LHSign, RHSign);
2807     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2808     B.buildSub(DstReg, UDivRem, DSign);
2809   }
2810 
2811   MI.eraseFromParent();
2812   return true;
2813 }
2814 
2815 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2816                                             MachineRegisterInfo &MRI,
2817                                             MachineIRBuilder &B) const {
2818   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2819     return legalizeSDIV_SREM32(MI, MRI, B);
2820   return false;
2821 }
2822 
2823 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2824                                                  MachineRegisterInfo &MRI,
2825                                                  MachineIRBuilder &B) const {
2826   Register Res = MI.getOperand(0).getReg();
2827   Register LHS = MI.getOperand(1).getReg();
2828   Register RHS = MI.getOperand(2).getReg();
2829 
2830   uint16_t Flags = MI.getFlags();
2831 
2832   LLT ResTy = MRI.getType(Res);
2833   LLT S32 = LLT::scalar(32);
2834   LLT S64 = LLT::scalar(64);
2835 
2836   const MachineFunction &MF = B.getMF();
2837   bool Unsafe =
2838     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2839 
2840   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2841     return false;
2842 
2843   if (!Unsafe && ResTy == S32 &&
2844       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2845     return false;
2846 
2847   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2848     // 1 / x -> RCP(x)
2849     if (CLHS->isExactlyValue(1.0)) {
2850       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2851         .addUse(RHS)
2852         .setMIFlags(Flags);
2853 
2854       MI.eraseFromParent();
2855       return true;
2856     }
2857 
2858     // -1 / x -> RCP( FNEG(x) )
2859     if (CLHS->isExactlyValue(-1.0)) {
2860       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2861       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2862         .addUse(FNeg.getReg(0))
2863         .setMIFlags(Flags);
2864 
2865       MI.eraseFromParent();
2866       return true;
2867     }
2868   }
2869 
2870   // x / y -> x * (1.0 / y)
2871   if (Unsafe) {
2872     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2873       .addUse(RHS)
2874       .setMIFlags(Flags);
2875     B.buildFMul(Res, LHS, RCP, Flags);
2876 
2877     MI.eraseFromParent();
2878     return true;
2879   }
2880 
2881   return false;
2882 }
2883 
2884 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2885                                          MachineRegisterInfo &MRI,
2886                                          MachineIRBuilder &B) const {
2887   Register Res = MI.getOperand(0).getReg();
2888   Register LHS = MI.getOperand(1).getReg();
2889   Register RHS = MI.getOperand(2).getReg();
2890 
2891   uint16_t Flags = MI.getFlags();
2892 
2893   LLT S16 = LLT::scalar(16);
2894   LLT S32 = LLT::scalar(32);
2895 
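  // Expand f16 division by computing the quotient in f32 (rcp followed by a
  // multiply), truncating back to f16, and letting div_fixup patch up special
  // cases such as infinities and NaNs.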
2896   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2897   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2898 
2899   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2900     .addUse(RHSExt.getReg(0))
2901     .setMIFlags(Flags);
2902 
2903   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2904   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2905 
2906   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2907     .addUse(RDst.getReg(0))
2908     .addUse(RHS)
2909     .addUse(LHS)
2910     .setMIFlags(Flags);
2911 
2912   MI.eraseFromParent();
2913   return true;
2914 }
2915 
// Enable or disable FP32 denormal handling by emitting the instructions that
// update the FP32 denorm field of the mode register.
2918 static void toggleSPDenormMode(bool Enable,
2919                                MachineIRBuilder &B,
2920                                const GCNSubtarget &ST,
2921                                AMDGPU::SIModeRegisterDefaults Mode) {
2922   // Set SP denorm mode to this value.
2923   unsigned SPDenormMode =
2924     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2925 
2926   if (ST.hasDenormModeInst()) {
2927     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2928     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2929 
2930     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2931     B.buildInstr(AMDGPU::S_DENORM_MODE)
2932       .addImm(NewDenormModeValue);
2933 
2934   } else {
2935     // Select FP32 bit field in mode register.
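    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm controls occupy
    // two bits starting at offset 4 of the MODE register.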
2936     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2937                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2938                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2939 
2940     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2941       .addImm(SPDenormMode)
2942       .addImm(SPDenormModeBitField);
2943   }
2944 }
2945 
2946 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2947                                          MachineRegisterInfo &MRI,
2948                                          MachineIRBuilder &B) const {
2949   Register Res = MI.getOperand(0).getReg();
2950   Register LHS = MI.getOperand(1).getReg();
2951   Register RHS = MI.getOperand(2).getReg();
2952   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2953   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2954 
2955   uint16_t Flags = MI.getFlags();
2956 
2957   LLT S32 = LLT::scalar(32);
2958   LLT S1 = LLT::scalar(1);
2959 
2960   auto One = B.buildFConstant(S32, 1.0f);
2961 
2962   auto DenominatorScaled =
2963     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2964       .addUse(LHS)
2965       .addUse(RHS)
2966       .addImm(0)
2967       .setMIFlags(Flags);
2968   auto NumeratorScaled =
2969     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2970       .addUse(LHS)
2971       .addUse(RHS)
2972       .addImm(1)
2973       .setMIFlags(Flags);
2974 
2975   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2976     .addUse(DenominatorScaled.getReg(0))
2977     .setMIFlags(Flags);
2978   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2979 
2980   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2981   // aren't modeled as reading it.
2982   if (!Mode.allFP32Denormals())
2983     toggleSPDenormMode(true, B, ST, Mode);
2984 
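  // One Newton-Raphson step refines the reciprocal approximation (Fma0, Fma1),
  // then the scaled quotient is formed and corrected (Mul, Fma2..Fma4).
  // div_fmas and div_fixup below undo the div_scale scaling and handle the
  // remaining special cases.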
2985   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2986   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2987   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2988   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2989   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2990   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2991 
2992   if (!Mode.allFP32Denormals())
2993     toggleSPDenormMode(false, B, ST, Mode);
2994 
2995   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2996     .addUse(Fma4.getReg(0))
2997     .addUse(Fma1.getReg(0))
2998     .addUse(Fma3.getReg(0))
2999     .addUse(NumeratorScaled.getReg(1))
3000     .setMIFlags(Flags);
3001 
3002   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3003     .addUse(Fmas.getReg(0))
3004     .addUse(RHS)
3005     .addUse(LHS)
3006     .setMIFlags(Flags);
3007 
3008   MI.eraseFromParent();
3009   return true;
3010 }
3011 
3012 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3013                                          MachineRegisterInfo &MRI,
3014                                          MachineIRBuilder &B) const {
3015   Register Res = MI.getOperand(0).getReg();
3016   Register LHS = MI.getOperand(1).getReg();
3017   Register RHS = MI.getOperand(2).getReg();
3018 
3019   uint16_t Flags = MI.getFlags();
3020 
3021   LLT S64 = LLT::scalar(64);
3022   LLT S1 = LLT::scalar(1);
3023 
3024   auto One = B.buildFConstant(S64, 1.0);
3025 
3026   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3027     .addUse(LHS)
3028     .addUse(RHS)
3029     .addImm(0)
3030     .setMIFlags(Flags);
3031 
3032   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3033 
3034   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3035     .addUse(DivScale0.getReg(0))
3036     .setMIFlags(Flags);
3037 
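  // Two Newton-Raphson iterations refine the f64 reciprocal estimate before
  // the quotient is formed and a final correction term is computed for
  // div_fmas.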
3038   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3039   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3040   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3041 
3042   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3043     .addUse(LHS)
3044     .addUse(RHS)
3045     .addImm(1)
3046     .setMIFlags(Flags);
3047 
3048   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3049   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3050   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3051 
3052   Register Scale;
3053   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3056 
3057     LLT S32 = LLT::scalar(32);
3058 
3059     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3060     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3061     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3062     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3063 
3064     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3065                               Scale1Unmerge.getReg(1));
3066     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3067                               Scale0Unmerge.getReg(1));
3068     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3069   } else {
3070     Scale = DivScale1.getReg(1);
3071   }
3072 
3073   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3074     .addUse(Fma4.getReg(0))
3075     .addUse(Fma3.getReg(0))
3076     .addUse(Mul.getReg(0))
3077     .addUse(Scale)
3078     .setMIFlags(Flags);
3079 
3080   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3081     .addUse(Fmas.getReg(0))
3082     .addUse(RHS)
3083     .addUse(LHS)
3084     .setMIFlags(Flags);
3085 
3086   MI.eraseFromParent();
3087   return true;
3088 }
3089 
3090 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3091                                                  MachineRegisterInfo &MRI,
3092                                                  MachineIRBuilder &B) const {
3093   Register Res = MI.getOperand(0).getReg();
3094   Register LHS = MI.getOperand(2).getReg();
3095   Register RHS = MI.getOperand(3).getReg();
3096   uint16_t Flags = MI.getFlags();
3097 
3098   LLT S32 = LLT::scalar(32);
3099   LLT S1 = LLT::scalar(1);
3100 
3101   auto Abs = B.buildFAbs(S32, RHS, Flags);
3102   const APFloat C0Val(1.0f);
3103 
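  // 0x6f800000 is 0x1.0p+96f and 0x2f800000 is 0x1.0p-32f. If |RHS| exceeds
  // 2^96, pre-scale the denominator by 2^-32 to keep the intermediate values
  // in range, then multiply the result by the same scale factor to compensate.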
3104   auto C0 = B.buildConstant(S32, 0x6f800000);
3105   auto C1 = B.buildConstant(S32, 0x2f800000);
3106   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3107 
3108   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3109   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3110 
3111   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3112 
3113   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3114     .addUse(Mul0.getReg(0))
3115     .setMIFlags(Flags);
3116 
3117   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3118 
3119   B.buildFMul(Res, Sel, Mul1, Flags);
3120 
3121   MI.eraseFromParent();
3122   return true;
3123 }
3124 
3125 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3126                                                  MachineRegisterInfo &MRI,
3127                                                  MachineIRBuilder &B) const {
3128   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3129   if (!MFI->isEntryFunction()) {
3130     return legalizePreloadedArgIntrin(MI, MRI, B,
3131                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3132   }
3133 
3134   uint64_t Offset =
3135     ST.getTargetLowering()->getImplicitParameterOffset(
3136       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3137   Register DstReg = MI.getOperand(0).getReg();
3138   LLT DstTy = MRI.getType(DstReg);
3139   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3140 
3141   const ArgDescriptor *Arg;
3142   const TargetRegisterClass *RC;
3143   std::tie(Arg, RC)
3144     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3145   if (!Arg)
3146     return false;
3147 
3148   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3149   if (!loadInputValue(KernargPtrReg, B, Arg))
3150     return false;
3151 
3152   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3153   MI.eraseFromParent();
3154   return true;
3155 }
3156 
3157 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3158                                               MachineRegisterInfo &MRI,
3159                                               MachineIRBuilder &B,
3160                                               unsigned AddrSpace) const {
3161   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3162   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3163   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3164   MI.eraseFromParent();
3165   return true;
3166 }
3167 
3168 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3169 // offset (the offset that is included in bounds checking and swizzling, to be
3170 // split between the instruction's voffset and immoffset fields) and soffset
3171 // (the offset that is excluded from bounds checking and swizzling, to go in
3172 // the instruction's soffset field).  This function takes the first kind of
3173 // offset and figures out how to split it between voffset and immoffset.
3174 std::tuple<Register, unsigned, unsigned>
3175 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3176                                         Register OrigOffset) const {
3177   const unsigned MaxImm = 4095;
3178   Register BaseReg;
3179   unsigned TotalConstOffset;
3180   MachineInstr *OffsetDef;
3181   const LLT S32 = LLT::scalar(32);
3182 
3183   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3184     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3185 
3186   unsigned ImmOffset = TotalConstOffset;
3187 
3188   // If the immediate value is too big for the immoffset field, put the value
3189   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
3191   // of being CSEd with the copy/add for another similar load/store.
3192   // However, do not do that rounding down to a multiple of 4096 if that is a
3193   // negative number, as it appears to be illegal to have a negative offset
3194   // in the vgpr, even if adding the immediate offset makes it positive.
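  // For example, a total constant offset of 5000 splits into an overflow of
  // 4096 (copied/added into the voffset) and an immoffset of 904.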
3195   unsigned Overflow = ImmOffset & ~MaxImm;
3196   ImmOffset -= Overflow;
3197   if ((int32_t)Overflow < 0) {
3198     Overflow += ImmOffset;
3199     ImmOffset = 0;
3200   }
3201 
3202   if (Overflow != 0) {
3203     if (!BaseReg) {
3204       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3205     } else {
3206       auto OverflowVal = B.buildConstant(S32, Overflow);
3207       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3208     }
3209   }
3210 
3211   if (!BaseReg)
3212     BaseReg = B.buildConstant(S32, 0).getReg(0);
3213 
3214   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3215 }
3216 
3217 /// Handle register layout difference for f16 images for some subtargets.
3218 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3219                                              MachineRegisterInfo &MRI,
3220                                              Register Reg) const {
3221   if (!ST.hasUnpackedD16VMem())
3222     return Reg;
3223 
3224   const LLT S16 = LLT::scalar(16);
3225   const LLT S32 = LLT::scalar(32);
3226   LLT StoreVT = MRI.getType(Reg);
3227   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3228 
3229   auto Unmerge = B.buildUnmerge(S16, Reg);
3230 
3231   SmallVector<Register, 4> WideRegs;
3232   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3233     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3234 
3235   int NumElts = StoreVT.getNumElements();
3236 
3237   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3238 }
3239 
3240 Register AMDGPULegalizerInfo::fixStoreSourceType(
3241   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3242   MachineRegisterInfo *MRI = B.getMRI();
3243   LLT Ty = MRI->getType(VData);
3244 
3245   const LLT S16 = LLT::scalar(16);
3246 
3247   // Fixup illegal register types for i8 stores.
3248   if (Ty == LLT::scalar(8) || Ty == S16) {
3249     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3250     return AnyExt;
3251   }
3252 
3253   if (Ty.isVector()) {
3254     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3255       if (IsFormat)
3256         return handleD16VData(B, *MRI, VData);
3257     }
3258   }
3259 
3260   return VData;
3261 }
3262 
3263 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3264                                               MachineRegisterInfo &MRI,
3265                                               MachineIRBuilder &B,
3266                                               bool IsTyped,
3267                                               bool IsFormat) const {
3268   Register VData = MI.getOperand(1).getReg();
3269   LLT Ty = MRI.getType(VData);
3270   LLT EltTy = Ty.getScalarType();
3271   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3272   const LLT S32 = LLT::scalar(32);
3273 
3274   VData = fixStoreSourceType(B, VData, IsFormat);
3275   Register RSrc = MI.getOperand(2).getReg();
3276 
3277   MachineMemOperand *MMO = *MI.memoperands_begin();
3278   const int MemSize = MMO->getSize();
3279 
3280   unsigned ImmOffset;
3281   unsigned TotalOffset;
3282 
3283   // The typed intrinsics add an immediate after the registers.
3284   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3285 
3286   // The struct intrinsic variants add one additional operand over raw.
3287   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3288   Register VIndex;
3289   int OpOffset = 0;
3290   if (HasVIndex) {
3291     VIndex = MI.getOperand(3).getReg();
3292     OpOffset = 1;
3293   }
3294 
3295   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3296   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3297 
3298   unsigned Format = 0;
3299   if (IsTyped) {
3300     Format = MI.getOperand(5 + OpOffset).getImm();
3301     ++OpOffset;
3302   }
3303 
3304   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3305 
3306   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3307   if (TotalOffset != 0)
3308     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3309 
3310   unsigned Opc;
3311   if (IsTyped) {
3312     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3313                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3314   } else if (IsFormat) {
3315     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3316                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3317   } else {
3318     switch (MemSize) {
3319     case 1:
3320       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3321       break;
3322     case 2:
3323       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3324       break;
3325     default:
3326       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3327       break;
3328     }
3329   }
3330 
3331   if (!VIndex)
3332     VIndex = B.buildConstant(S32, 0).getReg(0);
3333 
3334   auto MIB = B.buildInstr(Opc)
3335     .addUse(VData)              // vdata
3336     .addUse(RSrc)               // rsrc
3337     .addUse(VIndex)             // vindex
3338     .addUse(VOffset)            // voffset
3339     .addUse(SOffset)            // soffset
3340     .addImm(ImmOffset);         // offset(imm)
3341 
3342   if (IsTyped)
3343     MIB.addImm(Format);
3344 
3345   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3346      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3347      .addMemOperand(MMO);
3348 
3349   MI.eraseFromParent();
3350   return true;
3351 }
3352 
3353 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3354                                              MachineRegisterInfo &MRI,
3355                                              MachineIRBuilder &B,
3356                                              bool IsFormat,
3357                                              bool IsTyped) const {
3358   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3359   MachineMemOperand *MMO = *MI.memoperands_begin();
3360   const int MemSize = MMO->getSize();
3361   const LLT S32 = LLT::scalar(32);
3362 
3363   Register Dst = MI.getOperand(0).getReg();
3364   Register RSrc = MI.getOperand(2).getReg();
3365 
3366   // The typed intrinsics add an immediate after the registers.
3367   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3368 
3369   // The struct intrinsic variants add one additional operand over raw.
3370   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3371   Register VIndex;
3372   int OpOffset = 0;
3373   if (HasVIndex) {
3374     VIndex = MI.getOperand(3).getReg();
3375     OpOffset = 1;
3376   }
3377 
3378   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3379   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3380 
3381   unsigned Format = 0;
3382   if (IsTyped) {
3383     Format = MI.getOperand(5 + OpOffset).getImm();
3384     ++OpOffset;
3385   }
3386 
3387   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3388   unsigned ImmOffset;
3389   unsigned TotalOffset;
3390 
3391   LLT Ty = MRI.getType(Dst);
3392   LLT EltTy = Ty.getScalarType();
3393   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3394   const bool Unpacked = ST.hasUnpackedD16VMem();
3395 
3396   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3397   if (TotalOffset != 0)
3398     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3399 
3400   unsigned Opc;
3401 
3402   if (IsTyped) {
3403     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3404                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3405   } else if (IsFormat) {
3406     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3407                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3408   } else {
3409     switch (MemSize) {
3410     case 1:
3411       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3412       break;
3413     case 2:
3414       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3415       break;
3416     default:
3417       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3418       break;
3419     }
3420   }
3421 
3422   Register LoadDstReg;
3423 
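  // Loads narrower than a dword, and d16 scalar loads, are widened to a 32-bit
  // result here and truncated back to the requested type after the instruction.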
3424   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3425   LLT UnpackedTy = Ty.changeElementSize(32);
3426 
3427   if (IsExtLoad)
3428     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3429   else if (Unpacked && IsD16 && Ty.isVector())
3430     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3431   else
3432     LoadDstReg = Dst;
3433 
3434   if (!VIndex)
3435     VIndex = B.buildConstant(S32, 0).getReg(0);
3436 
3437   auto MIB = B.buildInstr(Opc)
3438     .addDef(LoadDstReg)         // vdata
3439     .addUse(RSrc)               // rsrc
3440     .addUse(VIndex)             // vindex
3441     .addUse(VOffset)            // voffset
3442     .addUse(SOffset)            // soffset
3443     .addImm(ImmOffset);         // offset(imm)
3444 
3445   if (IsTyped)
3446     MIB.addImm(Format);
3447 
3448   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3449      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3450      .addMemOperand(MMO);
3451 
3452   if (LoadDstReg != Dst) {
3453     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3454 
    // The result was widened for the extending load; truncate it back down.
3456     if (IsExtLoad)
3457       B.buildTrunc(Dst, LoadDstReg);
3458     else {
3459       // Repack to original 16-bit vector result
3460       // FIXME: G_TRUNC should work, but legalization currently fails
3461       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3462       SmallVector<Register, 4> Repack;
3463       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3464         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3465       B.buildMerge(Dst, Repack);
3466     }
3467   }
3468 
3469   MI.eraseFromParent();
3470   return true;
3471 }
3472 
3473 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3474                                                MachineIRBuilder &B,
3475                                                bool IsInc) const {
3476   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3477                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3478   B.buildInstr(Opc)
3479     .addDef(MI.getOperand(0).getReg())
3480     .addUse(MI.getOperand(2).getReg())
3481     .addUse(MI.getOperand(3).getReg())
3482     .cloneMemRefs(MI);
3483   MI.eraseFromParent();
3484   return true;
3485 }
3486 
3487 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3488   switch (IntrID) {
3489   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3495   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3497     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3498   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3499   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3500     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3501   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3502   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3503     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3504   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3505   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3506     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3507   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3508   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3509     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3510   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3511   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3512     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3513   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3514   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3515     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3516   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3517   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3518     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3519   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3520   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3521     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3522   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3523   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3524     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3525   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3526   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3527     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3528   default:
3529     llvm_unreachable("unhandled atomic opcode");
3530   }
3531 }
3532 
3533 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3534                                                MachineIRBuilder &B,
3535                                                Intrinsic::ID IID) const {
3536   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3537                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3538 
3539   Register Dst = MI.getOperand(0).getReg();
3540   Register VData = MI.getOperand(2).getReg();
3541 
3542   Register CmpVal;
3543   int OpOffset = 0;
3544 
3545   if (IsCmpSwap) {
3546     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3547     ++OpOffset;
3548   }
3549 
3550   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3551   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3552 
3553   // The struct intrinsic variants add one additional operand over raw.
3554   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3555   Register VIndex;
3556   if (HasVIndex) {
3557     VIndex = MI.getOperand(4 + OpOffset).getReg();
3558     ++OpOffset;
3559   }
3560 
3561   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3562   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3563   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3564 
3565   MachineMemOperand *MMO = *MI.memoperands_begin();
3566 
3567   unsigned ImmOffset;
3568   unsigned TotalOffset;
3569   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3570   if (TotalOffset != 0)
3571     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3572 
3573   if (!VIndex)
3574     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3575 
3576   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3577     .addDef(Dst)
3578     .addUse(VData); // vdata
3579 
3580   if (IsCmpSwap)
3581     MIB.addReg(CmpVal);
3582 
3583   MIB.addUse(RSrc)               // rsrc
3584      .addUse(VIndex)             // vindex
3585      .addUse(VOffset)            // voffset
3586      .addUse(SOffset)            // soffset
3587      .addImm(ImmOffset)          // offset(imm)
3588      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3589      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3590      .addMemOperand(MMO);
3591 
3592   MI.eraseFromParent();
3593   return true;
3594 }
3595 
/// Pack the s16 typed address operands of \p MI into dword sized vectors with
/// s16 typed elements, appending the results to \p PackedAddrs.
3598 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3599                                         SmallVectorImpl<Register> &PackedAddrs,
3600                                         int AddrIdx, int DimIdx, int NumVAddrs,
3601                                         int NumGradients) {
3602   const LLT S16 = LLT::scalar(16);
3603   const LLT V2S16 = LLT::vector(2, 16);
3604 
3605   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3606     MachineOperand &SrcOp = MI.getOperand(I);
3607     if (!SrcOp.isReg())
3608       continue; // _L to _LZ may have eliminated this.
3609 
3610     Register AddrReg = SrcOp.getReg();
3611 
3612     if (I < DimIdx) {
3613       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3614       PackedAddrs.push_back(AddrReg);
3615     } else {
3616       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3617       // derivatives dx/dh and dx/dv are packed with undef.
3618       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3619           ((NumGradients / 2) % 2 == 1 &&
3620            (I == DimIdx + (NumGradients / 2) - 1 ||
3621             I == DimIdx + NumGradients - 1)) ||
3622           // Check for _L to _LZ optimization
3623           !MI.getOperand(I + 1).isReg()) {
3624         PackedAddrs.push_back(
3625             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3626                 .getReg(0));
3627       } else {
3628         PackedAddrs.push_back(
3629             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3630                 .getReg(0));
3631         ++I;
3632       }
3633     }
3634   }
3635 }
3636 
3637 /// Convert from separate vaddr components to a single vector address register,
3638 /// and replace the remaining operands with $noreg.
3639 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3640                                      int DimIdx, int NumVAddrs) {
3641   const LLT S32 = LLT::scalar(32);
3642 
3643   SmallVector<Register, 8> AddrRegs;
3644   for (int I = 0; I != NumVAddrs; ++I) {
3645     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3646     if (SrcOp.isReg()) {
3647       AddrRegs.push_back(SrcOp.getReg());
3648       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3649     }
3650   }
3651 
3652   int NumAddrRegs = AddrRegs.size();
3653   if (NumAddrRegs != 1) {
3654     // Round up to 8 elements for v5-v7
3655     // FIXME: Missing intermediate sized register classes and instructions.
3656     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3657       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3658       auto Undef = B.buildUndef(S32);
3659       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3660       NumAddrRegs = RoundedNumRegs;
3661     }
3662 
3663     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3664     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3665   }
3666 
3667   for (int I = 1; I != NumVAddrs; ++I) {
3668     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3669     if (SrcOp.isReg())
3670       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3671   }
3672 }
3673 
3674 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3675 ///
3676 /// Depending on the subtarget, load/store with 16-bit element data need to be
3677 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3678 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3679 /// registers.
3680 ///
3681 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3686 /// now unnecessary arguments with $noreg.
3687 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3688     MachineInstr &MI, MachineIRBuilder &B,
3689     GISelChangeObserver &Observer,
3690     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3691 
3692   const int NumDefs = MI.getNumExplicitDefs();
3693   bool IsTFE = NumDefs == 2;
3694   // We are only processing the operands of d16 image operations on subtargets
3695   // that use the unpacked register layout, or need to repack the TFE result.
3696 
3697   // TODO: Do we need to guard against already legalized intrinsics?
3698   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3699     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3700 
3701   MachineRegisterInfo *MRI = B.getMRI();
3702   const LLT S32 = LLT::scalar(32);
3703   const LLT S16 = LLT::scalar(16);
3704   const LLT V2S16 = LLT::vector(2, 16);
3705 
3706   // Index of first address argument
3707   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3708 
3709   // Check for 16 bit addresses and pack if true.
3710   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3711   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3712   const bool IsA16 = AddrTy == S16;
3713 
3714   int NumVAddrs, NumGradients;
3715   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3716   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3717     getDMaskIdx(BaseOpcode, NumDefs);
3718   unsigned DMask = 0;
3719 
3720   int DMaskLanes = 0;
3721   if (!BaseOpcode->Atomic) {
3722     DMask = MI.getOperand(DMaskIdx).getImm();
3723     if (BaseOpcode->Gather4) {
3724       DMaskLanes = 4;
3725     } else if (DMask != 0) {
3726       DMaskLanes = countPopulation(DMask);
3727     } else if (!IsTFE && !BaseOpcode->Store) {
3728       // If dmask is 0, this is a no-op load. This can be eliminated.
3729       B.buildUndef(MI.getOperand(0));
3730       MI.eraseFromParent();
3731       return true;
3732     }
3733   }
3734 
3735   Observer.changingInstr(MI);
3736   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3737 
3738   unsigned NewOpcode = NumDefs == 0 ?
3739     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3740 
3741   // Track that we legalized this
3742   MI.setDesc(B.getTII().get(NewOpcode));
3743 
  // With TFE enabled and dmask set to 0 we still expect the error flag, so
  // force dmask to be at least 1, otherwise the instruction will fail.
3746   if (IsTFE && DMask == 0) {
3747     DMask = 0x1;
3748     DMaskLanes = 1;
3749     MI.getOperand(DMaskIdx).setImm(DMask);
3750   }
3751 
3752   if (BaseOpcode->Atomic) {
3753     Register VData0 = MI.getOperand(2).getReg();
3754     LLT Ty = MRI->getType(VData0);
3755 
3756     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3757     if (Ty.isVector())
3758       return false;
3759 
3760     if (BaseOpcode->AtomicX2) {
3761       Register VData1 = MI.getOperand(3).getReg();
3762       // The two values are packed in one register.
3763       LLT PackedTy = LLT::vector(2, Ty);
3764       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3765       MI.getOperand(2).setReg(Concat.getReg(0));
3766       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3767     }
3768   }
3769 
3770   int CorrectedNumVAddrs = NumVAddrs;
3771 
3772   // Optimize _L to _LZ when _L is zero
3773   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3774         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3775     const ConstantFP *ConstantLod;
3776     const int LodIdx = AddrIdx + NumVAddrs - 1;
3777 
3778     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3779       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3780         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3781         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3782           LZMappingInfo->LZ, ImageDimIntr->Dim);
3783 
3784         // The starting indexes should remain in the same place.
3785         --NumVAddrs;
3786         --CorrectedNumVAddrs;
3787 
3788         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3789           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3790         MI.RemoveOperand(LodIdx);
3791       }
3792     }
3793   }
3794 
3795   // Optimize _mip away, when 'lod' is zero
3796   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3797     int64_t ConstantLod;
3798     const int LodIdx = AddrIdx + NumVAddrs - 1;
3799 
3800     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3801       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling is done above.
3804         MI.getOperand(LodIdx).ChangeToImmediate(0);
3805         --CorrectedNumVAddrs;
3806       }
3807     }
3808   }
3809 
3810   // If the register allocator cannot place the address registers contiguously
3811   // without introducing moves, then using the non-sequential address encoding
3812   // is always preferable, since it saves VALU instructions and is usually a
3813   // wash in terms of code size or even better.
3814   //
3815   // However, we currently have no way of hinting to the register allocator
3816   // that MIMG addresses should be placed contiguously when it is possible to
3817   // do so, so force non-NSA for the common 2-address case as a heuristic.
3818   //
3819   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3820   // allocation when possible.
3821   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3822 
3823   // Rewrite the addressing register layout before doing anything else.
3824   if (IsA16) {
3825     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3826     // should be introduced.
3827     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3828       return false;
3829 
3830     if (NumVAddrs > 1) {
3831       SmallVector<Register, 4> PackedRegs;
3832       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3833                                   NumGradients);
3834 
3835       if (!UseNSA && PackedRegs.size() > 1) {
3836         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3837         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3838         PackedRegs[0] = Concat.getReg(0);
3839         PackedRegs.resize(1);
3840       }
3841 
3842       const int NumPacked = PackedRegs.size();
3843       for (int I = 0; I != NumVAddrs; ++I) {
3844         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3845         if (!SrcOp.isReg()) {
3846           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3847           continue;
3848         }
3849 
3850         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3851 
3852         if (I < NumPacked)
3853           SrcOp.setReg(PackedRegs[I]);
3854         else
3855           SrcOp.setReg(AMDGPU::NoRegister);
3856       }
3857     }
3858   } else if (!UseNSA && NumVAddrs > 1) {
3859     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3860   }
3861 
3862 
3863   if (BaseOpcode->Store) { // No TFE for stores?
3864     // TODO: Handle dmask trim
3865     Register VData = MI.getOperand(1).getReg();
3866     LLT Ty = MRI->getType(VData);
3867     if (!Ty.isVector() || Ty.getElementType() != S16)
3868       return true;
3869 
3870     Register RepackedReg = handleD16VData(B, *MRI, VData);
3871     if (RepackedReg != VData) {
3872       MI.getOperand(1).setReg(RepackedReg);
3873     }
3874 
3875     return true;
3876   }
3877 
3878   Register DstReg = MI.getOperand(0).getReg();
3879   LLT Ty = MRI->getType(DstReg);
3880   const LLT EltTy = Ty.getScalarType();
3881   const bool IsD16 = Ty.getScalarType() == S16;
3882   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3883 
3884   // Confirm that the return type is large enough for the dmask specified
3885   if (NumElts < DMaskLanes)
3886     return false;
3887 
3888   if (NumElts > 4 || DMaskLanes > 4)
3889     return false;
3890 
3891   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3892   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3893 
3894   // The raw dword aligned data component of the load. The only legal cases
3895   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3897   LLT RoundedTy;
3898 
  // S32 vector to cover all data, plus the TFE result element.
3900   LLT TFETy;
3901 
3902   // Register type to use for each loaded component. Will be S32 or V2S16.
3903   LLT RegTy;
3904 
3905   if (IsD16 && ST.hasUnpackedD16VMem()) {
3906     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3907     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3908     RegTy = S32;
3909   } else {
3910     unsigned EltSize = EltTy.getSizeInBits();
3911     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3912     unsigned RoundedSize = 32 * RoundedElts;
3913     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3914     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3915     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3916   }
3917 
3918   // The return type does not need adjustment.
3919   // TODO: Should we change s16 case to s32 or <2 x s16>?
3920   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3921     return true;
3922 
3923   Register Dst1Reg;
3924 
3925   // Insert after the instruction.
3926   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3927 
3928   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3929   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3930   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3931   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3932 
3933   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3934 
3935   MI.getOperand(0).setReg(NewResultReg);
3936 
3937   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
3939   // register, with one additional dword beyond the loaded data. Rewrite the
3940   // return type to use a single register result.
3941 
3942   if (IsTFE) {
3943     Dst1Reg = MI.getOperand(1).getReg();
3944     if (MRI->getType(Dst1Reg) != S32)
3945       return false;
3946 
3947     // TODO: Make sure the TFE operand bit is set.
3948     MI.RemoveOperand(1);
3949 
3950     // Handle the easy case that requires no repack instructions.
3951     if (Ty == S32) {
3952       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3953       return true;
3954     }
3955   }
3956 
3957   // Now figure out how to copy the new result register back into the old
3958   // result.
3959   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3960 
3961   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3962 
3963   if (ResultNumRegs == 1) {
3964     assert(!IsTFE);
3965     ResultRegs[0] = NewResultReg;
3966   } else {
3967     // We have to repack into a new vector of some kind.
3968     for (int I = 0; I != NumDataRegs; ++I)
3969       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3970     B.buildUnmerge(ResultRegs, NewResultReg);
3971 
3972     // Drop the final TFE element to get the data part. The TFE result is
3973     // directly written to the right place already.
3974     if (IsTFE)
3975       ResultRegs.resize(NumDataRegs);
3976   }
3977 
3978   // For an s16 scalar result, we form an s32 result with a truncate regardless
3979   // of packed vs. unpacked.
3980   if (IsD16 && !Ty.isVector()) {
3981     B.buildTrunc(DstReg, ResultRegs[0]);
3982     return true;
3983   }
3984 
3985   // Avoid a build/concat_vector of 1 entry.
3986   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3987     B.buildBitcast(DstReg, ResultRegs[0]);
3988     return true;
3989   }
3990 
3991   assert(Ty.isVector());
3992 
3993   if (IsD16) {
3994     // For packed D16 results with TFE enabled, all the data components are
3995     // S32. Cast back to the expected type.
3996     //
    // TODO: We don't really need to load s32 elements. We would only need one
3998     // cast for the TFE result if a multiple of v2s16 was used.
3999     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4000       for (Register &Reg : ResultRegs)
4001         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4002     } else if (ST.hasUnpackedD16VMem()) {
4003       for (Register &Reg : ResultRegs)
4004         Reg = B.buildTrunc(S16, Reg).getReg(0);
4005     }
4006   }
4007 
4008   auto padWithUndef = [&](LLT Ty, int NumElts) {
4009     if (NumElts == 0)
4010       return;
4011     Register Undef = B.buildUndef(Ty).getReg(0);
4012     for (int I = 0; I != NumElts; ++I)
4013       ResultRegs.push_back(Undef);
4014   };
4015 
4016   // Pad out any elements eliminated due to the dmask.
4017   LLT ResTy = MRI->getType(ResultRegs[0]);
4018   if (!ResTy.isVector()) {
4019     padWithUndef(ResTy, NumElts - ResultRegs.size());
4020     B.buildBuildVector(DstReg, ResultRegs);
4021     return true;
4022   }
4023 
4024   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4025   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4026 
4027   // Deal with the one annoying legal case.
4028   const LLT V3S16 = LLT::vector(3, 16);
4029   if (Ty == V3S16) {
4030     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4031     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4032     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4033     return true;
4034   }
4035 
4036   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4037   B.buildConcatVectors(DstReg, ResultRegs);
4038   return true;
4039 }
4040 
4041 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4042   MachineInstr &MI, MachineIRBuilder &B,
4043   GISelChangeObserver &Observer) const {
4044   Register Dst = MI.getOperand(0).getReg();
4045   LLT Ty = B.getMRI()->getType(Dst);
4046   unsigned Size = Ty.getSizeInBits();
4047   MachineFunction &MF = B.getMF();
4048 
4049   Observer.changingInstr(MI);
4050 
4051   // FIXME: We don't really need this intermediate instruction. The intrinsic
4052   // should be fixed to have a memory operand. Since it's readnone, we're not
4053   // allowed to add one.
4054   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4055   MI.RemoveOperand(1); // Remove intrinsic ID
4056 
4057   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4058   // TODO: Should this use datalayout alignment?
4059   const unsigned MemSize = (Size + 7) / 8;
4060   const Align MemAlign(4);
4061   MachineMemOperand *MMO = MF.getMachineMemOperand(
4062       MachinePointerInfo(),
4063       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4064           MachineMemOperand::MOInvariant,
4065       MemSize, MemAlign);
4066   MI.addMemOperand(MF, MMO);
4067 
4068   // There are no 96-bit result scalar loads, but widening to 128-bit should
4069   // always be legal. We may need to restore this to a 96-bit result if it turns
4070   // out this needs to be converted to a vector load during RegBankSelect.
4071   if (!isPowerOf2_32(Size)) {
4072     LegalizerHelper Helper(MF, *this, Observer, B);
4073 
4074     if (Ty.isVector())
4075       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4076     else
4077       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4078   }
4079 
4080   Observer.changedInstr(MI);
4081   return true;
4082 }
4083 
4084 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4085                                                 MachineRegisterInfo &MRI,
4086                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4088   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4089       !ST.isTrapHandlerEnabled()) {
4090     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4091   } else {
4092     // Pass queue pointer to trap handler as input, and insert trap instruction
4093     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4094     const ArgDescriptor *Arg =
4095         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4096     if (!Arg)
4097       return false;
4098     MachineRegisterInfo &MRI = *B.getMRI();
4099     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4100     Register LiveIn = getLiveInRegister(
4101         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4102         /*InsertLiveInCopy=*/false);
4103     if (!loadInputValue(LiveIn, B, Arg))
4104       return false;
4105     B.buildCopy(SGPR01, LiveIn);
4106     B.buildInstr(AMDGPU::S_TRAP)
4107         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4108         .addReg(SGPR01, RegState::Implicit);
4109   }
4110 
4111   MI.eraseFromParent();
4112   return true;
4113 }
4114 
4115 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4116     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4119   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4120       !ST.isTrapHandlerEnabled()) {
4121     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4122                                      "debugtrap handler not supported",
4123                                      MI.getDebugLoc(), DS_Warning);
4124     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4125     Ctx.diagnose(NoTrap);
4126   } else {
4127     // Insert debug-trap instruction
4128     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4129   }
4130 
4131   MI.eraseFromParent();
4132   return true;
4133 }
4134 
4135 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4136                                             MachineIRBuilder &B,
4137                                             GISelChangeObserver &Observer) const {
4138   MachineRegisterInfo &MRI = *B.getMRI();
4139 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4141   auto IntrID = MI.getIntrinsicID();
4142   switch (IntrID) {
4143   case Intrinsic::amdgcn_if:
4144   case Intrinsic::amdgcn_else: {
4145     MachineInstr *Br = nullptr;
4146     MachineBasicBlock *UncondBrTarget = nullptr;
4147     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4148       const SIRegisterInfo *TRI
4149         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4150 
4151       Register Def = MI.getOperand(1).getReg();
4152       Register Use = MI.getOperand(3).getReg();
4153 
4154       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4155       if (IntrID == Intrinsic::amdgcn_if) {
4156         B.buildInstr(AMDGPU::SI_IF)
4157           .addDef(Def)
4158           .addUse(Use)
4159           .addMBB(UncondBrTarget);
4160       } else {
4161         B.buildInstr(AMDGPU::SI_ELSE)
4162           .addDef(Def)
4163           .addUse(Use)
4164           .addMBB(UncondBrTarget)
4165           .addImm(0);
4166       }
4167 
4168       if (Br) {
4169         Br->getOperand(0).setMBB(CondBrTarget);
4170       } else {
4171         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4172         // since we're swapping branch targets it needs to be reinserted.
4173         // FIXME: IRTranslator should probably not do this
4174         B.buildBr(*CondBrTarget);
4175       }
4176 
4177       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4178       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4179       MI.eraseFromParent();
4180       BrCond->eraseFromParent();
4181       return true;
4182     }
4183 
4184     return false;
4185   }
4186   case Intrinsic::amdgcn_loop: {
4187     MachineInstr *Br = nullptr;
4188     MachineBasicBlock *UncondBrTarget = nullptr;
4189     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4190       const SIRegisterInfo *TRI
4191         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4192 
4193       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4194       Register Reg = MI.getOperand(2).getReg();
4195       B.buildInstr(AMDGPU::SI_LOOP)
4196         .addUse(Reg)
4197         .addMBB(UncondBrTarget);
4198 
4199       if (Br)
4200         Br->getOperand(0).setMBB(CondBrTarget);
4201       else
4202         B.buildBr(*CondBrTarget);
4203 
4204       MI.eraseFromParent();
4205       BrCond->eraseFromParent();
4206       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4207       return true;
4208     }
4209 
4210     return false;
4211   }
4212   case Intrinsic::amdgcn_kernarg_segment_ptr:
4213     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4214       // This only makes sense to call in a kernel, so just lower to null.
4215       B.buildConstant(MI.getOperand(0).getReg(), 0);
4216       MI.eraseFromParent();
4217       return true;
4218     }
4219 
4220     return legalizePreloadedArgIntrin(
4221       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4222   case Intrinsic::amdgcn_implicitarg_ptr:
4223     return legalizeImplicitArgPtr(MI, MRI, B);
4224   case Intrinsic::amdgcn_workitem_id_x:
4225     return legalizePreloadedArgIntrin(MI, MRI, B,
4226                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4227   case Intrinsic::amdgcn_workitem_id_y:
4228     return legalizePreloadedArgIntrin(MI, MRI, B,
4229                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4230   case Intrinsic::amdgcn_workitem_id_z:
4231     return legalizePreloadedArgIntrin(MI, MRI, B,
4232                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4233   case Intrinsic::amdgcn_workgroup_id_x:
4234     return legalizePreloadedArgIntrin(MI, MRI, B,
4235                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4236   case Intrinsic::amdgcn_workgroup_id_y:
4237     return legalizePreloadedArgIntrin(MI, MRI, B,
4238                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4239   case Intrinsic::amdgcn_workgroup_id_z:
4240     return legalizePreloadedArgIntrin(MI, MRI, B,
4241                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4242   case Intrinsic::amdgcn_dispatch_ptr:
4243     return legalizePreloadedArgIntrin(MI, MRI, B,
4244                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4245   case Intrinsic::amdgcn_queue_ptr:
4246     return legalizePreloadedArgIntrin(MI, MRI, B,
4247                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4248   case Intrinsic::amdgcn_implicit_buffer_ptr:
4249     return legalizePreloadedArgIntrin(
4250       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4251   case Intrinsic::amdgcn_dispatch_id:
4252     return legalizePreloadedArgIntrin(MI, MRI, B,
4253                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4254   case Intrinsic::amdgcn_fdiv_fast:
4255     return legalizeFDIVFastIntrin(MI, MRI, B);
4256   case Intrinsic::amdgcn_is_shared:
4257     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4258   case Intrinsic::amdgcn_is_private:
4259     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4260   case Intrinsic::amdgcn_wavefrontsize: {
4261     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4262     MI.eraseFromParent();
4263     return true;
4264   }
4265   case Intrinsic::amdgcn_s_buffer_load:
4266     return legalizeSBufferLoad(MI, B, Observer);
4267   case Intrinsic::amdgcn_raw_buffer_store:
4268   case Intrinsic::amdgcn_struct_buffer_store:
4269     return legalizeBufferStore(MI, MRI, B, false, false);
4270   case Intrinsic::amdgcn_raw_buffer_store_format:
4271   case Intrinsic::amdgcn_struct_buffer_store_format:
4272     return legalizeBufferStore(MI, MRI, B, false, true);
4273   case Intrinsic::amdgcn_raw_tbuffer_store:
4274   case Intrinsic::amdgcn_struct_tbuffer_store:
4275     return legalizeBufferStore(MI, MRI, B, true, true);
4276   case Intrinsic::amdgcn_raw_buffer_load:
4277   case Intrinsic::amdgcn_struct_buffer_load:
4278     return legalizeBufferLoad(MI, MRI, B, false, false);
4279   case Intrinsic::amdgcn_raw_buffer_load_format:
4280   case Intrinsic::amdgcn_struct_buffer_load_format:
4281     return legalizeBufferLoad(MI, MRI, B, true, false);
4282   case Intrinsic::amdgcn_raw_tbuffer_load:
4283   case Intrinsic::amdgcn_struct_tbuffer_load:
4284     return legalizeBufferLoad(MI, MRI, B, true, true);
4285   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4286   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4287   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4288   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4289   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4290   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4291   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4292   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4293   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4294   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4295   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4296   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4297   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4298   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4299   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4300   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4301   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4302   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4303   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4304   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4305   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4306   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4307   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4308   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4309   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4310   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4311     return legalizeBufferAtomic(MI, B, IntrID);
4312   case Intrinsic::amdgcn_atomic_inc:
4313     return legalizeAtomicIncDec(MI, B, true);
4314   case Intrinsic::amdgcn_atomic_dec:
4315     return legalizeAtomicIncDec(MI, B, false);
4316   case Intrinsic::trap:
4317     return legalizeTrapIntrinsic(MI, MRI, B);
4318   case Intrinsic::debugtrap:
4319     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4320   default: {
4321     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4322             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4323       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4324     return true;
4325   }
4326   }
4327 
4328   return true;
4329 }
4330