1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
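// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.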
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size in bits up to the next power of two.
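// e.g. s48 -> s64, s96 -> s128.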
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
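// True for vectors of sub-32-bit elements with an odd element count whose
// total size is not a multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.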
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
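// Reduce the number of elements so each resulting piece is at most 64 bits,
// e.g. <4 x s32> -> <2 x s32>, <4 x s64> -> s64.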
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements to reach the next multiple of 32
// bits.
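// e.g. <3 x s8> -> <4 x s8>, <5 x s16> -> <6 x s16>.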
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
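      // <4 x s16> -> <2 x s32>
      // <6 x s16> -> <3 x s32>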
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
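// e.g. v2s32, v4s32, v2s64, and v4s16 are register types; v3s16 is not.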
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
  if (IsLoad && MemSize < RegSize)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
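// e.g. <6 x s16> and <8 x s16> loads/stores are instead handled as <3 x s32>
// and <4 x s32> via a bitcast.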
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
478 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .minScalar(0, S32)
652        .scalarize(0)
653        .widenScalarToNextPow2(1);
654 
655   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
656     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
657     .customFor({{S64, S64}});
658   if (ST.has16BitInsts())
659     FPToI.legalFor({{S16, S16}});
660   else
661     FPToI.minScalar(1, S32);
662 
663   FPToI.minScalar(0, S32)
664        .scalarize(0)
665        .lower();
666 
667   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
668     .scalarize(0)
669     .lower();
670 
671   if (ST.has16BitInsts()) {
672     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
673       .legalFor({S16, S32, S64})
674       .clampScalar(0, S16, S64)
675       .scalarize(0);
676   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
677     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
678       .legalFor({S32, S64})
679       .clampScalar(0, S32, S64)
680       .scalarize(0);
681   } else {
682     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
683       .legalFor({S32})
684       .customFor({S64})
685       .clampScalar(0, S32, S64)
686       .scalarize(0);
687   }
688 
689   // FIXME: Clamp offset operand.
690   getActionDefinitionsBuilder(G_PTR_ADD)
691     .legalIf(isPointer(0))
692     .scalarize(0);
693 
694   getActionDefinitionsBuilder(G_PTRMASK)
695     .legalIf(typeInSet(1, {S64, S32}))
696     .minScalar(1, S32)
697     .maxScalarIf(sizeIs(0, 32), 1, S32)
698     .maxScalarIf(sizeIs(0, 64), 1, S64)
699     .scalarize(0);
700 
701   auto &CmpBuilder =
702     getActionDefinitionsBuilder(G_ICMP)
703     // The compare output type differs based on the register bank of the output,
704     // so make both s1 and s32 legal.
705     //
706     // Scalar compares producing output in scc will be promoted to s32, as that
707     // is the allocatable register type that will be needed for the copy from
708     // scc. This will be promoted during RegBankSelect, and we assume something
709     // before that won't try to use s32 result types.
710     //
711     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
712     // bank.
713     .legalForCartesianProduct(
714       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
715     .legalForCartesianProduct(
716       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
717   if (ST.has16BitInsts()) {
718     CmpBuilder.legalFor({{S1, S16}});
719   }
720 
721   CmpBuilder
722     .widenScalarToNextPow2(1)
723     .clampScalar(1, S32, S64)
724     .scalarize(0)
725     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
726 
727   getActionDefinitionsBuilder(G_FCMP)
728     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
729     .widenScalarToNextPow2(1)
730     .clampScalar(1, S32, S64)
731     .scalarize(0);
732 
733   // FIXME: fpow has a selection pattern that should move to custom lowering.
734   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
735   if (ST.has16BitInsts())
736     Exp2Ops.legalFor({S32, S16});
737   else
738     Exp2Ops.legalFor({S32});
739   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
740   Exp2Ops.scalarize(0);
741 
742   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
743   if (ST.has16BitInsts())
744     ExpOps.customFor({{S32}, {S16}});
745   else
746     ExpOps.customFor({S32});
747   ExpOps.clampScalar(0, MinScalarFPTy, S32)
748         .scalarize(0);
749 
750   // The 64-bit versions produce 32-bit results, but only on the SALU.
751   getActionDefinitionsBuilder(G_CTPOP)
752     .legalFor({{S32, S32}, {S32, S64}})
753     .clampScalar(0, S32, S32)
754     .clampScalar(1, S32, S64)
755     .scalarize(0)
756     .widenScalarToNextPow2(0, 32)
757     .widenScalarToNextPow2(1, 32);
758 
759   // The hardware instructions return a different result on 0 than the generic
760   // instructions expect. The hardware produces -1, but these produce the
761   // bitwidth.
762   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
763     .scalarize(0)
764     .clampScalar(0, S32, S32)
765     .clampScalar(1, S32, S64)
766     .widenScalarToNextPow2(0, 32)
767     .widenScalarToNextPow2(1, 32)
768     .lower();
769 
770   // The 64-bit versions produce 32-bit results, but only on the SALU.
771   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
772     .legalFor({{S32, S32}, {S32, S64}})
773     .clampScalar(0, S32, S32)
774     .clampScalar(1, S32, S64)
775     .scalarize(0)
776     .widenScalarToNextPow2(0, 32)
777     .widenScalarToNextPow2(1, 32);
778 
779   getActionDefinitionsBuilder(G_BITREVERSE)
780     .legalFor({S32})
781     .clampScalar(0, S32, S32)
782     .scalarize(0);
783 
784   if (ST.has16BitInsts()) {
785     getActionDefinitionsBuilder(G_BSWAP)
786       .legalFor({S16, S32, V2S16})
787       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for the
      // narrowScalar limitation.
790       .widenScalarToNextPow2(0)
791       .clampScalar(0, S16, S32)
792       .scalarize(0);
793 
794     if (ST.hasVOP3PInsts()) {
795       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
796         .legalFor({S32, S16, V2S16})
797         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
798         .clampMaxNumElements(0, S16, 2)
799         .minScalar(0, S16)
800         .widenScalarToNextPow2(0)
801         .scalarize(0)
802         .lower();
803     } else {
804       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
805         .legalFor({S32, S16})
806         .widenScalarToNextPow2(0)
807         .minScalar(0, S16)
808         .scalarize(0)
809         .lower();
810     }
811   } else {
812     // TODO: Should have same legality without v_perm_b32
813     getActionDefinitionsBuilder(G_BSWAP)
814       .legalFor({S32})
815       .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for the
      // narrowScalar limitation.
818       .widenScalarToNextPow2(0)
819       .maxScalar(0, S32)
820       .scalarize(0)
821       .lower();
822 
823     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
824       .legalFor({S32})
825       .minScalar(0, S32)
826       .widenScalarToNextPow2(0)
827       .scalarize(0)
828       .lower();
829   }
830 
831   getActionDefinitionsBuilder(G_INTTOPTR)
832     // List the common cases
833     .legalForCartesianProduct(AddrSpaces64, {S64})
834     .legalForCartesianProduct(AddrSpaces32, {S32})
835     .scalarize(0)
836     // Accept any address space as long as the size matches
837     .legalIf(sameSize(0, 1))
838     .widenScalarIf(smallerThan(1, 0),
839       [](const LegalityQuery &Query) {
840         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
841       })
842     .narrowScalarIf(largerThan(1, 0),
843       [](const LegalityQuery &Query) {
844         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
845       });
846 
847   getActionDefinitionsBuilder(G_PTRTOINT)
848     // List the common cases
849     .legalForCartesianProduct(AddrSpaces64, {S64})
850     .legalForCartesianProduct(AddrSpaces32, {S32})
851     .scalarize(0)
852     // Accept any address space as long as the size matches
853     .legalIf(sameSize(0, 1))
854     .widenScalarIf(smallerThan(0, 1),
855       [](const LegalityQuery &Query) {
856         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
857       })
858     .narrowScalarIf(
859       largerThan(0, 1),
860       [](const LegalityQuery &Query) {
861         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
862       });
863 
864   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
865     .scalarize(0)
866     .custom();
867 
868   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
869                                     bool IsLoad) -> bool {
870     const LLT DstTy = Query.Types[0];
871 
872     // Split vector extloads.
873     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
874     unsigned Align = Query.MMODescrs[0].AlignInBits;
875 
876     if (MemSize < DstTy.getSizeInBits())
877       MemSize = std::max(MemSize, Align);
878 
879     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
880       return true;
881 
882     const LLT PtrTy = Query.Types[1];
883     unsigned AS = PtrTy.getAddressSpace();
884     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
885       return true;
886 
887     // Catch weird sized loads that don't evenly divide into the access sizes
888     // TODO: May be able to widen depending on alignment etc.
889     unsigned NumRegs = (MemSize + 31) / 32;
890     if (NumRegs == 3) {
891       if (!ST.hasDwordx3LoadStores())
892         return true;
893     } else {
894       // If the alignment allows, these should have been widened.
895       if (!isPowerOf2_32(NumRegs))
896         return true;
897     }
898 
899     if (Align < MemSize) {
900       const SITargetLowering *TLI = ST.getTargetLowering();
901       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
902     }
903 
904     return false;
905   };
906 
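  // Widen a non-power-of-2 load result to the next power of two when the
  // known alignment already covers the rounded size, e.g. widen a 96-bit
  // load to 128 bits if it is 128-bit aligned (and dwordx3 loads are
  // unavailable).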
907   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
908                                          unsigned Opc) -> bool {
909     unsigned Size = Query.Types[0].getSizeInBits();
910     if (isPowerOf2_32(Size))
911       return false;
912 
913     if (Size == 96 && ST.hasDwordx3LoadStores())
914       return false;
915 
916     unsigned AddrSpace = Query.Types[1].getAddressSpace();
917     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
918       return false;
919 
920     unsigned Align = Query.MMODescrs[0].AlignInBits;
921     unsigned RoundedSize = NextPowerOf2(Size);
922     return (Align >= RoundedSize);
923   };
924 
925   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
926   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
927   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
928 
929   // TODO: Refine based on subtargets which support unaligned access or 128-bit
930   // LDS
931   // TODO: Unsupported flat for SI.
932 
933   for (unsigned Op : {G_LOAD, G_STORE}) {
934     const bool IsStore = Op == G_STORE;
935 
936     auto &Actions = getActionDefinitionsBuilder(Op);
937     // Explicitly list some common cases.
938     // TODO: Does this help compile time at all?
939     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
940                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
941                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
942                                       {S64, GlobalPtr, 64, GlobalAlign32},
943                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
944                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
945                                       {S32, GlobalPtr, 8, GlobalAlign8},
946                                       {S32, GlobalPtr, 16, GlobalAlign16},
947 
948                                       {S32, LocalPtr, 32, 32},
949                                       {S64, LocalPtr, 64, 32},
950                                       {V2S32, LocalPtr, 64, 32},
951                                       {S32, LocalPtr, 8, 8},
952                                       {S32, LocalPtr, 16, 16},
953                                       {V2S16, LocalPtr, 32, 32},
954 
955                                       {S32, PrivatePtr, 32, 32},
956                                       {S32, PrivatePtr, 8, 8},
957                                       {S32, PrivatePtr, 16, 16},
958                                       {V2S16, PrivatePtr, 32, 32},
959 
960                                       {S32, ConstantPtr, 32, GlobalAlign32},
961                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
962                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
963                                       {S64, ConstantPtr, 64, GlobalAlign32},
964                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
965     Actions.legalIf(
966       [=](const LegalityQuery &Query) -> bool {
967         return isLoadStoreLegal(ST, Query, Op);
968       });
969 
970     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
971     // 64-bits.
972     //
973     // TODO: Should generalize bitcast action into coerce, which will also cover
974     // inserting addrspacecasts.
975     Actions.customIf(typeIs(1, Constant32Ptr));
976 
977     // Turn any illegal element vectors into something easier to deal
978     // with. These will ultimately produce 32-bit scalar shifts to extract the
979     // parts anyway.
980     //
981     // For odd 16-bit element vectors, prefer to split those into pieces with
982     // 16-bit vector parts.
983     Actions.bitcastIf(
984       [=](const LegalityQuery &Query) -> bool {
985         const LLT Ty = Query.Types[0];
986         const unsigned Size = Ty.getSizeInBits();
987 
988         if (Size != Query.MMODescrs[0].SizeInBits)
989           return Size <= 32 && Ty.isVector();
990 
991         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
992           return true;
993         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
994                !isRegisterVectorElementType(Ty.getElementType());
995       }, bitcastToRegisterType(0));
996 
997     Actions
998         .customIf(typeIs(1, Constant32Ptr))
999         // Widen suitably aligned loads by loading extra elements.
1000         .moreElementsIf([=](const LegalityQuery &Query) {
1001             const LLT Ty = Query.Types[0];
1002             return Op == G_LOAD && Ty.isVector() &&
1003                    shouldWidenLoadResult(Query, Op);
1004           }, moreElementsToNextPow2(0))
1005         .widenScalarIf([=](const LegalityQuery &Query) {
1006             const LLT Ty = Query.Types[0];
1007             return Op == G_LOAD && !Ty.isVector() &&
1008                    shouldWidenLoadResult(Query, Op);
1009           }, widenScalarOrEltToNextPow2(0))
1010         .narrowScalarIf(
1011             [=](const LegalityQuery &Query) -> bool {
1012               return !Query.Types[0].isVector() &&
1013                      needToSplitMemOp(Query, Op == G_LOAD);
1014             },
1015             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1016               const LLT DstTy = Query.Types[0];
1017               const LLT PtrTy = Query.Types[1];
1018 
1019               const unsigned DstSize = DstTy.getSizeInBits();
1020               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1021 
1022               // Split extloads.
1023               if (DstSize > MemSize)
1024                 return std::make_pair(0, LLT::scalar(MemSize));
1025 
1026               if (!isPowerOf2_32(DstSize)) {
1027                 // We're probably decomposing an odd sized store. Try to split
1028                 // to the widest type. TODO: Account for alignment. As-is it
1029                 // should be OK, since the new parts will be further legalized.
1030                 unsigned FloorSize = PowerOf2Floor(DstSize);
1031                 return std::make_pair(0, LLT::scalar(FloorSize));
1032               }
1033 
1034               if (DstSize > 32 && (DstSize % 32 != 0)) {
1035                 // FIXME: Need a way to specify non-extload of larger size if
1036                 // suitably aligned.
1037                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1038               }
1039 
1040               unsigned MaxSize = maxSizeForAddrSpace(ST,
1041                                                      PtrTy.getAddressSpace(),
1042                                                      Op == G_LOAD);
1043               if (MemSize > MaxSize)
1044                 return std::make_pair(0, LLT::scalar(MaxSize));
1045 
1046               unsigned Align = Query.MMODescrs[0].AlignInBits;
1047               return std::make_pair(0, LLT::scalar(Align));
1048             })
1049         .fewerElementsIf(
1050             [=](const LegalityQuery &Query) -> bool {
1051               return Query.Types[0].isVector() &&
1052                      needToSplitMemOp(Query, Op == G_LOAD);
1053             },
1054             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1055               const LLT DstTy = Query.Types[0];
1056               const LLT PtrTy = Query.Types[1];
1057 
1058               LLT EltTy = DstTy.getElementType();
1059               unsigned MaxSize = maxSizeForAddrSpace(ST,
1060                                                      PtrTy.getAddressSpace(),
1061                                                      Op == G_LOAD);
1062 
1063               // FIXME: Handle widened to power of 2 results better. This ends
1064               // up scalarizing.
1065               // FIXME: 3 element stores scalarized on SI
1066 
1067               // Split if it's too large for the address space.
1068               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1069                 unsigned NumElts = DstTy.getNumElements();
1070                 unsigned EltSize = EltTy.getSizeInBits();
1071 
1072                 if (MaxSize % EltSize == 0) {
1073                   return std::make_pair(
1074                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1075                 }
1076 
1077                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1078 
1079                 // FIXME: Refine when odd breakdowns handled
1080                 // The scalars will need to be re-legalized.
1081                 if (NumPieces == 1 || NumPieces >= NumElts ||
1082                     NumElts % NumPieces != 0)
1083                   return std::make_pair(0, EltTy);
1084 
1085                 return std::make_pair(0,
1086                                       LLT::vector(NumElts / NumPieces, EltTy));
1087               }
1088 
1089               // FIXME: We could probably handle weird extending loads better.
1090               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1091               if (DstTy.getSizeInBits() > MemSize)
1092                 return std::make_pair(0, EltTy);
1093 
1094               unsigned EltSize = EltTy.getSizeInBits();
1095               unsigned DstSize = DstTy.getSizeInBits();
1096               if (!isPowerOf2_32(DstSize)) {
1097                 // We're probably decomposing an odd sized store. Try to split
1098                 // to the widest type. TODO: Account for alignment. As-is it
1099                 // should be OK, since the new parts will be further legalized.
1100                 unsigned FloorSize = PowerOf2Floor(DstSize);
1101                 return std::make_pair(
1102                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1103               }
1104 
1105               // Need to split because of alignment.
1106               unsigned Align = Query.MMODescrs[0].AlignInBits;
1107               if (EltSize > Align &&
1108                   (EltSize / Align < DstTy.getNumElements())) {
1109                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1110               }
1111 
1112               // May need relegalization for the scalars.
1113               return std::make_pair(0, EltTy);
1114             })
1115         .minScalar(0, S32);
1116 
1117     if (IsStore)
1118       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1119 
1120     // TODO: Need a bitcast lower option?
1121     Actions
1122         .widenScalarToNextPow2(0)
1123         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1124   }
1125 
1126   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1127                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1128                                                   {S32, GlobalPtr, 16, 2 * 8},
1129                                                   {S32, LocalPtr, 8, 8},
1130                                                   {S32, LocalPtr, 16, 16},
1131                                                   {S32, PrivatePtr, 8, 8},
1132                                                   {S32, PrivatePtr, 16, 16},
1133                                                   {S32, ConstantPtr, 8, 8},
1134                                                   {S32, ConstantPtr, 16, 2 * 8}});
1135   if (ST.hasFlatAddressSpace()) {
1136     ExtLoads.legalForTypesWithMemDesc(
1137         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1138   }
1139 
1140   ExtLoads.clampScalar(0, S32, S32)
1141           .widenScalarToNextPow2(0)
1142           .unsupportedIfMemSizeNotPow2()
1143           .lower();
1144 
1145   auto &Atomics = getActionDefinitionsBuilder(
1146     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1147      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1148      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1149      G_ATOMICRMW_UMIN})
1150     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1151                {S64, GlobalPtr}, {S64, LocalPtr}});
1152   if (ST.hasFlatAddressSpace()) {
1153     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1154   }
1155 
1156   if (ST.hasLDSFPAtomics()) {
1157     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1158       .legalFor({{S32, LocalPtr}});
1159   }
1160 
1161   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1162   // demarshalling
1163   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1164     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1165                 {S32, FlatPtr}, {S64, FlatPtr}})
1166     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1167                {S32, RegionPtr}, {S64, RegionPtr}});
1168   // TODO: Pointer types, any 32-bit or 64-bit vector
1169 
1170   // Condition should be s32 for scalar, s1 for vector.
1171   getActionDefinitionsBuilder(G_SELECT)
1172     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1173           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1174           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1175     .clampScalar(0, S16, S64)
1176     .scalarize(1)
1177     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1178     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1179     .clampMaxNumElements(0, S32, 2)
1180     .clampMaxNumElements(0, LocalPtr, 2)
1181     .clampMaxNumElements(0, PrivatePtr, 2)
1182     .scalarize(0)
1183     .widenScalarToNextPow2(0)
1184     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1185 
1186   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1187   // be more flexible with the shift amount type.
1188   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1189     .legalFor({{S32, S32}, {S64, S32}});
1190   if (ST.has16BitInsts()) {
1191     if (ST.hasVOP3PInsts()) {
1192       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1193             .clampMaxNumElements(0, S16, 2);
1194     } else
1195       Shifts.legalFor({{S16, S16}});
1196 
1197     // TODO: Support 16-bit shift amounts for all types
1198     Shifts.widenScalarIf(
1199       [=](const LegalityQuery &Query) {
1200         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1201         // 32-bit amount.
1202         const LLT ValTy = Query.Types[0];
1203         const LLT AmountTy = Query.Types[1];
1204         return ValTy.getSizeInBits() <= 16 &&
1205                AmountTy.getSizeInBits() < 16;
1206       }, changeTo(1, S16));
1207     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1208     Shifts.clampScalar(1, S32, S32);
1209     Shifts.clampScalar(0, S16, S64);
1210     Shifts.widenScalarToNextPow2(0, 16);
1211   } else {
1212     // Make sure we legalize the shift amount type first, as the general
1213     // expansion for the shifted type will produce much worse code if it hasn't
1214     // been truncated already.
1215     Shifts.clampScalar(1, S32, S32);
1216     Shifts.clampScalar(0, S32, S64);
1217     Shifts.widenScalarToNextPow2(0, 32);
1218   }
1219   Shifts.scalarize(0);
1220 
1221   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1222     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1223     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1224     unsigned IdxTypeIdx = 2;
1225 
1226     getActionDefinitionsBuilder(Op)
1227       .customIf([=](const LegalityQuery &Query) {
1228           const LLT EltTy = Query.Types[EltTypeIdx];
1229           const LLT VecTy = Query.Types[VecTypeIdx];
1230           const LLT IdxTy = Query.Types[IdxTypeIdx];
1231           return (EltTy.getSizeInBits() == 16 ||
1232                   EltTy.getSizeInBits() % 32 == 0) &&
1233                  VecTy.getSizeInBits() % 32 == 0 &&
1234                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1235                  IdxTy.getSizeInBits() == 32;
1236         })
1237       .clampScalar(EltTypeIdx, S32, S64)
1238       .clampScalar(VecTypeIdx, S32, S64)
1239       .clampScalar(IdxTypeIdx, S32, S32);
1240   }
1241 
1242   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1243     .unsupportedIf([=](const LegalityQuery &Query) {
1244         const LLT &EltTy = Query.Types[1].getElementType();
1245         return Query.Types[0] != EltTy;
1246       });
1247 
1248   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1249     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1250     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1251 
1252     // FIXME: Doesn't handle extract of illegal sizes.
1253     getActionDefinitionsBuilder(Op)
1254       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1255       // FIXME: Multiples of 16 should not be legal.
1256       .legalIf([=](const LegalityQuery &Query) {
1257           const LLT BigTy = Query.Types[BigTyIdx];
1258           const LLT LitTy = Query.Types[LitTyIdx];
1259           return (BigTy.getSizeInBits() % 32 == 0) &&
1260                  (LitTy.getSizeInBits() % 16 == 0);
1261         })
1262       .widenScalarIf(
1263         [=](const LegalityQuery &Query) {
1264           const LLT BigTy = Query.Types[BigTyIdx];
1265           return (BigTy.getScalarSizeInBits() < 16);
1266         },
1267         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1268       .widenScalarIf(
1269         [=](const LegalityQuery &Query) {
1270           const LLT LitTy = Query.Types[LitTyIdx];
1271           return (LitTy.getScalarSizeInBits() < 16);
1272         },
1273         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1274       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1275       .widenScalarToNextPow2(BigTyIdx, 32);
1276 
1277   }
1278 
1279   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1280     .legalForCartesianProduct(AllS32Vectors, {S32})
1281     .legalForCartesianProduct(AllS64Vectors, {S64})
1282     .clampNumElements(0, V16S32, V32S32)
1283     .clampNumElements(0, V2S64, V16S64)
1284     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1285 
1286   if (ST.hasScalarPackInsts()) {
1287     BuildVector
1288       // FIXME: Should probably widen s1 vectors straight to s32
1289       .minScalarOrElt(0, S16)
1290       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1291       .minScalar(1, S32);
1292 
1293     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1294       .legalFor({V2S16, S32})
1295       .lower();
1296     BuildVector.minScalarOrElt(0, S32);
1297   } else {
1298     BuildVector.customFor({V2S16, S16});
1299     BuildVector.minScalarOrElt(0, S32);
1300 
1301     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1302       .customFor({V2S16, S32})
1303       .lower();
1304   }
1305 
1306   BuildVector.legalIf(isRegisterType(0));
1307 
1308   // FIXME: Clamp maximum size
1309   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1310     .legalIf(isRegisterType(0));
1311 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalization.
1314   if (ST.hasVOP3PInsts()) {
1315     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1316       .customFor({V2S16, V2S16})
1317       .lower();
1318   } else
1319     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1320 
1321   // Merge/Unmerge
1322   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1323     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1324     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1325 
1326     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1327       const LLT Ty = Query.Types[TypeIdx];
1328       if (Ty.isVector()) {
1329         const LLT &EltTy = Ty.getElementType();
1330         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1331           return true;
1332         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1333           return true;
1334       }
1335       return false;
1336     };
1337 
1338     auto &Builder = getActionDefinitionsBuilder(Op)
1339       .lowerFor({{S16, V2S16}})
1340       .lowerIf([=](const LegalityQuery &Query) {
1341           const LLT BigTy = Query.Types[BigTyIdx];
1342           return BigTy.getSizeInBits() == 32;
1343         })
1344       // Try to widen to s16 first for small types.
1345       // TODO: Only do this on targets with legal s16 shifts
1346       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1347       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1348       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1349       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1350                            elementTypeIs(1, S16)),
1351                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
1355       .clampScalar(LitTyIdx, S32, S512)
1356       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1357       // Break up vectors with weird elements into scalars
1358       .fewerElementsIf(
1359         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1360         scalarize(0))
1361       .fewerElementsIf(
1362         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1363         scalarize(1))
1364       .clampScalar(BigTyIdx, S32, MaxScalar);
1365 
1366     if (Op == G_MERGE_VALUES) {
1367       Builder.widenScalarIf(
1368         // TODO: Use 16-bit shifts if legal for 8-bit values?
1369         [=](const LegalityQuery &Query) {
1370           const LLT Ty = Query.Types[LitTyIdx];
1371           return Ty.getSizeInBits() < 32;
1372         },
1373         changeTo(LitTyIdx, S32));
1374     }
1375 
1376     Builder.widenScalarIf(
1377       [=](const LegalityQuery &Query) {
1378         const LLT Ty = Query.Types[BigTyIdx];
1379         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1380           Ty.getSizeInBits() % 16 != 0;
1381       },
1382       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
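        // e.g. s65 -> s128; s257 -> s320 (the next multiple of 64).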
1385         const LLT &Ty = Query.Types[BigTyIdx];
1386         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1387         if (NewSizeInBits >= 256) {
1388           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1389           if (RoundedTo < NewSizeInBits)
1390             NewSizeInBits = RoundedTo;
1391         }
1392         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1393       })
1394       .legalIf([=](const LegalityQuery &Query) {
1395           const LLT &BigTy = Query.Types[BigTyIdx];
1396           const LLT &LitTy = Query.Types[LitTyIdx];
1397 
1398           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1399             return false;
1400           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1401             return false;
1402 
1403           return BigTy.getSizeInBits() % 16 == 0 &&
1404                  LitTy.getSizeInBits() % 16 == 0 &&
1405                  BigTy.getSizeInBits() <= MaxRegisterSize;
1406         })
1407       // Any vectors left are the wrong size. Scalarize them.
1408       .scalarize(0)
1409       .scalarize(1);
1410   }
1411 
1412   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1413   // RegBankSelect.
1414   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1415     .legalFor({{S32}, {S64}});
1416 
1417   if (ST.hasVOP3PInsts()) {
1418     SextInReg.lowerFor({{V2S16}})
1419       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1420       // get more vector shift opportunities, since we'll get those when
1421       // expanded.
1422       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1423   } else if (ST.has16BitInsts()) {
1424     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1425   } else {
1426     // Prefer to promote to s32 before lowering if we don't have 16-bit
1427     // shifts. This avoids a lot of intermediate truncate and extend operations.
1428     SextInReg.lowerFor({{S32}, {S64}});
1429   }
1430 
1431   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1432   // available, and is selectively legal for s16, s32, v2s16.
1433   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1434     .scalarize(0)
1435     .clampScalar(0, S16, S32);
1436 
1437   SextInReg
1438     .scalarize(0)
1439     .clampScalar(0, S32, S64)
1440     .lower();
1441 
1442   getActionDefinitionsBuilder(G_FSHR)
1443     .legalFor({{S32, S32}})
1444     .scalarize(0)
1445     .lower();
1446 
1447   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1448     .legalFor({S64});
1449 
1450   getActionDefinitionsBuilder({
1451       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1452       G_FCOPYSIGN,
1453 
1454       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1455       G_READ_REGISTER,
1456       G_WRITE_REGISTER,
1457 
1458       G_SADDO, G_SSUBO,
1459 
1460       // TODO: Implement
1461       G_FMINIMUM, G_FMAXIMUM,
1462       G_FSHL
1463     }).lower();
1464 
1465   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1466         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1467         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1468     .unsupported();
1469 
1470   computeTables();
1471   verify(*ST.getInstrInfo());
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1475                                          MachineInstr &MI) const {
1476   MachineIRBuilder &B = Helper.MIRBuilder;
1477   MachineRegisterInfo &MRI = *B.getMRI();
1478   GISelChangeObserver &Observer = Helper.Observer;
1479 
1480   switch (MI.getOpcode()) {
1481   case TargetOpcode::G_ADDRSPACE_CAST:
1482     return legalizeAddrSpaceCast(MI, MRI, B);
1483   case TargetOpcode::G_FRINT:
1484     return legalizeFrint(MI, MRI, B);
1485   case TargetOpcode::G_FCEIL:
1486     return legalizeFceil(MI, MRI, B);
1487   case TargetOpcode::G_INTRINSIC_TRUNC:
1488     return legalizeIntrinsicTrunc(MI, MRI, B);
1489   case TargetOpcode::G_SITOFP:
1490     return legalizeITOFP(MI, MRI, B, true);
1491   case TargetOpcode::G_UITOFP:
1492     return legalizeITOFP(MI, MRI, B, false);
1493   case TargetOpcode::G_FPTOSI:
1494     return legalizeFPTOI(MI, MRI, B, true);
1495   case TargetOpcode::G_FPTOUI:
1496     return legalizeFPTOI(MI, MRI, B, false);
1497   case TargetOpcode::G_FMINNUM:
1498   case TargetOpcode::G_FMAXNUM:
1499   case TargetOpcode::G_FMINNUM_IEEE:
1500   case TargetOpcode::G_FMAXNUM_IEEE:
1501     return legalizeMinNumMaxNum(Helper, MI);
1502   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1503     return legalizeExtractVectorElt(MI, MRI, B);
1504   case TargetOpcode::G_INSERT_VECTOR_ELT:
1505     return legalizeInsertVectorElt(MI, MRI, B);
1506   case TargetOpcode::G_SHUFFLE_VECTOR:
1507     return legalizeShuffleVector(MI, MRI, B);
1508   case TargetOpcode::G_FSIN:
1509   case TargetOpcode::G_FCOS:
1510     return legalizeSinCos(MI, MRI, B);
1511   case TargetOpcode::G_GLOBAL_VALUE:
1512     return legalizeGlobalValue(MI, MRI, B);
1513   case TargetOpcode::G_LOAD:
1514     return legalizeLoad(MI, MRI, B, Observer);
1515   case TargetOpcode::G_FMAD:
1516     return legalizeFMad(MI, MRI, B);
1517   case TargetOpcode::G_FDIV:
1518     return legalizeFDIV(MI, MRI, B);
1519   case TargetOpcode::G_UDIV:
1520   case TargetOpcode::G_UREM:
1521     return legalizeUDIV_UREM(MI, MRI, B);
1522   case TargetOpcode::G_SDIV:
1523   case TargetOpcode::G_SREM:
1524     return legalizeSDIV_SREM(MI, MRI, B);
1525   case TargetOpcode::G_ATOMIC_CMPXCHG:
1526     return legalizeAtomicCmpXChg(MI, MRI, B);
1527   case TargetOpcode::G_FLOG:
1528     return legalizeFlog(MI, B, numbers::ln2f);
1529   case TargetOpcode::G_FLOG10:
1530     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1531   case TargetOpcode::G_FEXP:
1532     return legalizeFExp(MI, B);
1533   case TargetOpcode::G_FPOW:
1534     return legalizeFPow(MI, B);
1535   case TargetOpcode::G_FFLOOR:
1536     return legalizeFFloor(MI, MRI, B);
1537   case TargetOpcode::G_BUILD_VECTOR:
1538     return legalizeBuildVector(MI, MRI, B);
1539   default:
1540     return false;
1541   }
1542 
1543   llvm_unreachable("expected switch to return");
1544 }
1545 
1546 Register AMDGPULegalizerInfo::getSegmentAperture(
1547   unsigned AS,
1548   MachineRegisterInfo &MRI,
1549   MachineIRBuilder &B) const {
1550   MachineFunction &MF = B.getMF();
1551   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1552   const LLT S32 = LLT::scalar(32);
1553 
1554   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1555 
1556   if (ST.hasApertureRegs()) {
1557     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1558     // getreg.
1559     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1560         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1561         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1562     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1563         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1564         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1565     unsigned Encoding =
1566         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1567         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1568         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1569 
1570     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1571 
1572     B.buildInstr(AMDGPU::S_GETREG_B32)
1573       .addDef(GetReg)
1574       .addImm(Encoding);
1575     MRI.setType(GetReg, S32);
1576 
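    // The getreg above reads the aperture base as a (WidthM1 + 1)-bit field;
    // shifting it left by the field width reconstructs the high 32 bits of
    // the 64-bit aperture, the same value the queue-pointer path below loads
    // from amd_queue_t.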
1577     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1578     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1579   }
1580 
1581   Register QueuePtr = MRI.createGenericVirtualRegister(
1582     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1583 
1584   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1585   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1586     return Register();
1587 
1588   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1589   // private_segment_aperture_base_hi.
1590   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1591 
1592   // TODO: can we be smarter about machine pointer info?
1593   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1594   MachineMemOperand *MMO = MF.getMachineMemOperand(
1595       PtrInfo,
1596       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1597           MachineMemOperand::MOInvariant,
1598       4, commonAlignment(Align(64), StructOffset));
1599 
1600   Register LoadAddr;
1601 
1602   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1603   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1604 }
1605 
1606 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1607   MachineInstr &MI, MachineRegisterInfo &MRI,
1608   MachineIRBuilder &B) const {
1609   MachineFunction &MF = B.getMF();
1610 
1611   const LLT S32 = LLT::scalar(32);
1612   Register Dst = MI.getOperand(0).getReg();
1613   Register Src = MI.getOperand(1).getReg();
1614 
1615   LLT DstTy = MRI.getType(Dst);
1616   LLT SrcTy = MRI.getType(Src);
1617   unsigned DestAS = DstTy.getAddressSpace();
1618   unsigned SrcAS = SrcTy.getAddressSpace();
1619 
1620   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1621   // vector element.
1622   assert(!DstTy.isVector());
1623 
1624   const AMDGPUTargetMachine &TM
1625     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1626 
1627   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1628   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1629     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1630     return true;
1631   }
1632 
1633   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1634     // Truncate.
1635     B.buildExtract(Dst, Src, 0);
1636     MI.eraseFromParent();
1637     return true;
1638   }
1639 
1640   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1641     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1642     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1643 
1644     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1645     // another pointer. Merge operands are required to be the same type, but
1646     // creating an extra ptrtoint would be kind of pointless.
1647     auto HighAddr = B.buildConstant(
1648       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1649     B.buildMerge(Dst, {Src, HighAddr});
1650     MI.eraseFromParent();
1651     return true;
1652   }
1653 
1654   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1655     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1656            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1657     unsigned NullVal = TM.getNullPointerValue(DestAS);
1658 
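    // A flat pointer is narrowed to a segment (LDS/private) pointer by taking
    // its low 32 bits, except that a null flat pointer must map to the
    // segment's null value, hence the compare and select below.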
1659     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1660     auto FlatNull = B.buildConstant(SrcTy, 0);
1661 
1662     // Extract low 32-bits of the pointer.
1663     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1664 
1665     auto CmpRes =
1666         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1667     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1668 
1669     MI.eraseFromParent();
1670     return true;
1671   }
1672 
1673   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1674     return false;
1675 
1676   if (!ST.hasFlatAddressSpace())
1677     return false;
1678 
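  // For the segment-to-flat direction, the 64-bit flat pointer is formed by
  // pairing the 32-bit segment offset with the aperture base in the high
  // half, again mapping the segment null value to the flat null pointer.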
1679   auto SegmentNull =
1680       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1681   auto FlatNull =
1682       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1683 
1684   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1685   if (!ApertureReg.isValid())
1686     return false;
1687 
1688   auto CmpRes =
1689       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1690 
1691   // Coerce the type of the low half of the result so we can use merge_values.
1692   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1693 
1694   // TODO: Should we allow mismatched types but matching sizes in merges to
1695   // avoid the ptrtoint?
1696   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1697   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1698 
1699   MI.eraseFromParent();
1700   return true;
1701 }
1702 
1703 bool AMDGPULegalizerInfo::legalizeFrint(
1704   MachineInstr &MI, MachineRegisterInfo &MRI,
1705   MachineIRBuilder &B) const {
1706   Register Src = MI.getOperand(1).getReg();
1707   LLT Ty = MRI.getType(Src);
1708   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1709 
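  // Rounding trick: adding 2^52 (with the sign of the source) and subtracting
  // it again forces the f64 mantissa to round to an integer in the current
  // rounding mode; e.g. under round-to-nearest 3.7 + 2^52 rounds to 2^52 + 4,
  // so the subtraction yields 4.0. Inputs with a magnitude above
  // 0x1.fffffffffffffp+51 are already integers and are passed through by the
  // final select.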
1710   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1711   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1712 
1713   auto C1 = B.buildFConstant(Ty, C1Val);
1714   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1715 
1716   // TODO: Should this propagate fast-math-flags?
1717   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1718   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1719 
1720   auto C2 = B.buildFConstant(Ty, C2Val);
1721   auto Fabs = B.buildFAbs(Ty, Src);
1722 
1723   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1724   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1725   return true;
1726 }
1727 
1728 bool AMDGPULegalizerInfo::legalizeFceil(
1729   MachineInstr &MI, MachineRegisterInfo &MRI,
1730   MachineIRBuilder &B) const {
1731 
1732   const LLT S1 = LLT::scalar(1);
1733   const LLT S64 = LLT::scalar(64);
1734 
1735   Register Src = MI.getOperand(1).getReg();
1736   assert(MRI.getType(Src) == S64);
1737 
1738   // result = trunc(src)
1739   // if (src > 0.0 && src != result)
1740   //   result += 1.0
1741 
1742   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1743 
1744   const auto Zero = B.buildFConstant(S64, 0.0);
1745   const auto One = B.buildFConstant(S64, 1.0);
1746   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1747   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1748   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1749   auto Add = B.buildSelect(S64, And, One, Zero);
1750 
1751   // TODO: Should this propagate fast-math-flags?
1752   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1753   return true;
1754 }
1755 
1756 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1757                                               MachineIRBuilder &B) {
1758   const unsigned FractBits = 52;
1759   const unsigned ExpBits = 11;
1760   LLT S32 = LLT::scalar(32);
1761 
1762   auto Const0 = B.buildConstant(S32, FractBits - 32);
1763   auto Const1 = B.buildConstant(S32, ExpBits);
1764 
1765   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
1766     .addUse(Const0.getReg(0))
1767     .addUse(Const1.getReg(0));
1768 
1769   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1770 }
1771 
1772 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1773   MachineInstr &MI, MachineRegisterInfo &MRI,
1774   MachineIRBuilder &B) const {
1775   const LLT S1 = LLT::scalar(1);
1776   const LLT S32 = LLT::scalar(32);
1777   const LLT S64 = LLT::scalar(64);
1778 
1779   Register Src = MI.getOperand(1).getReg();
1780   assert(MRI.getType(Src) == S64);
1781 
1782   // TODO: Should this use extract since the low half is unused?
1783   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1784   Register Hi = Unmerge.getReg(1);
1785 
1786   // Extract the upper half, since this is where we will find the sign and
1787   // exponent.
1788   auto Exp = extractF64Exponent(Hi, B);
1789 
1790   const unsigned FractBits = 52;
1791 
1792   // Extract the sign bit.
1793   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1794   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1795 
1796   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1797 
1798   const auto Zero32 = B.buildConstant(S32, 0);
1799 
1800   // Extend back to 64-bits.
1801   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1802 
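  // The fraction mask shifted right (arithmetically) by the unbiased exponent
  // leaves exactly the fractional bits, so clearing them in the source
  // truncates it (e.g. exponent 1 keeps only the top fraction bit, turning
  // 3.5 into 3.0). Exponents below zero mean |x| < 1, so only the sign
  // survives; exponents above 51 mean the value is already an integer.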
1803   auto Shr = B.buildAShr(S64, FractMask, Exp);
1804   auto Not = B.buildNot(S64, Shr);
1805   auto Tmp0 = B.buildAnd(S64, Src, Not);
1806   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1807 
1808   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1809   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1810 
1811   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1812   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1813   return true;
1814 }
1815 
1816 bool AMDGPULegalizerInfo::legalizeITOFP(
1817   MachineInstr &MI, MachineRegisterInfo &MRI,
1818   MachineIRBuilder &B, bool Signed) const {
1819 
1820   Register Dst = MI.getOperand(0).getReg();
1821   Register Src = MI.getOperand(1).getReg();
1822 
1823   const LLT S64 = LLT::scalar(64);
1824   const LLT S32 = LLT::scalar(32);
1825 
1826   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1827 
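  // Convert the two 32-bit halves separately: the high half carries the sign
  // and is converted with the requested signedness, then scaled by 2^32 via
  // ldexp; the low half is always converted as unsigned and added in.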
1828   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1829 
1830   auto CvtHi = Signed ?
1831     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1832     B.buildUITOFP(S64, Unmerge.getReg(1));
1833 
1834   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1835 
1836   auto ThirtyTwo = B.buildConstant(S32, 32);
1837   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1838     .addUse(CvtHi.getReg(0))
1839     .addUse(ThirtyTwo.getReg(0));
1840 
1841   // TODO: Should this propagate fast-math-flags?
1842   B.buildFAdd(Dst, LdExp, CvtLo);
1843   MI.eraseFromParent();
1844   return true;
1845 }
1846 
1847 // TODO: Copied from DAG implementation. Verify logic and document how this
1848 // actually works.
1849 bool AMDGPULegalizerInfo::legalizeFPTOI(
1850   MachineInstr &MI, MachineRegisterInfo &MRI,
1851   MachineIRBuilder &B, bool Signed) const {
1852 
1853   Register Dst = MI.getOperand(0).getReg();
1854   Register Src = MI.getOperand(1).getReg();
1855 
1856   const LLT S64 = LLT::scalar(64);
1857   const LLT S32 = LLT::scalar(32);
1858 
1859   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1860 
1861   unsigned Flags = MI.getFlags();
1862 
1863   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1864   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1865   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1866 
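  // K0 and K1 are 2^-32 and -(2^32) as raw f64 bit patterns: the high half of
  // the result is floor(trunc(x) * 2^-32), and the low half is the remainder
  // trunc(x) - hi * 2^32, recovered by the fma below.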
1867   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1868   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1869   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1870 
1871   auto Hi = Signed ?
1872     B.buildFPTOSI(S32, FloorMul) :
1873     B.buildFPTOUI(S32, FloorMul);
1874   auto Lo = B.buildFPTOUI(S32, Fma);
1875 
1876   B.buildMerge(Dst, { Lo, Hi });
1877   MI.eraseFromParent();
1878 
1879   return true;
1880 }
1881 
1882 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1883                                                MachineInstr &MI) const {
1884   MachineFunction &MF = Helper.MIRBuilder.getMF();
1885   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1886 
1887   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1888                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1889 
1890   // With ieee_mode disabled, the instructions have the correct behavior
1891   // already for G_FMINNUM/G_FMAXNUM
1892   if (!MFI->getMode().IEEE)
1893     return !IsIEEEOp;
1894 
1895   if (IsIEEEOp)
1896     return true;
1897 
1898   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1899 }
1900 
1901 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1902   MachineInstr &MI, MachineRegisterInfo &MRI,
1903   MachineIRBuilder &B) const {
1904   // TODO: Should move some of this into LegalizerHelper.
1905 
1906   // TODO: Promote dynamic indexing of s16 to s32
1907 
1908   // FIXME: Artifact combiner probably should have replaced the truncated
1909   // constant before this, so we shouldn't need
1910   // getConstantVRegValWithLookThrough.
1911   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1912     MI.getOperand(2).getReg(), MRI);
1913   if (!IdxVal) // Dynamic case will be selected to register indexing.
1914     return true;
1915 
1916   Register Dst = MI.getOperand(0).getReg();
1917   Register Vec = MI.getOperand(1).getReg();
1918 
1919   LLT VecTy = MRI.getType(Vec);
1920   LLT EltTy = VecTy.getElementType();
1921   assert(EltTy == MRI.getType(Dst));
1922 
1923   if (IdxVal->Value < VecTy.getNumElements())
1924     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1925   else
1926     B.buildUndef(Dst);
1927 
1928   MI.eraseFromParent();
1929   return true;
1930 }
1931 
1932 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1933   MachineInstr &MI, MachineRegisterInfo &MRI,
1934   MachineIRBuilder &B) const {
1935   // TODO: Should move some of this into LegalizerHelper.
1936 
1937   // TODO: Promote dynamic indexing of s16 to s32
1938 
1939   // FIXME: Artifact combiner probably should have replaced the truncated
1940   // constant before this, so we shouldn't need
1941   // getConstantVRegValWithLookThrough.
1942   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1943     MI.getOperand(3).getReg(), MRI);
1944   if (!IdxVal) // Dynamic case will be selected to register indexing.
1945     return true;
1946 
1947   Register Dst = MI.getOperand(0).getReg();
1948   Register Vec = MI.getOperand(1).getReg();
1949   Register Ins = MI.getOperand(2).getReg();
1950 
1951   LLT VecTy = MRI.getType(Vec);
1952   LLT EltTy = VecTy.getElementType();
1953   assert(EltTy == MRI.getType(Ins));
1954 
1955   if (IdxVal->Value < VecTy.getNumElements())
1956     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1957   else
1958     B.buildUndef(Dst);
1959 
1960   MI.eraseFromParent();
1961   return true;
1962 }
1963 
1964 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1965   MachineInstr &MI, MachineRegisterInfo &MRI,
1966   MachineIRBuilder &B) const {
1967   const LLT V2S16 = LLT::vector(2, 16);
1968 
1969   Register Dst = MI.getOperand(0).getReg();
1970   Register Src0 = MI.getOperand(1).getReg();
1971   LLT DstTy = MRI.getType(Dst);
1972   LLT SrcTy = MRI.getType(Src0);
1973 
1974   if (SrcTy == V2S16 && DstTy == V2S16 &&
1975       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1976     return true;
1977 
1978   MachineIRBuilder HelperBuilder(MI);
1979   GISelObserverWrapper DummyObserver;
1980   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1981   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1982 }
1983 
1984 bool AMDGPULegalizerInfo::legalizeSinCos(
1985   MachineInstr &MI, MachineRegisterInfo &MRI,
1986   MachineIRBuilder &B) const {
1987 
1988   Register DstReg = MI.getOperand(0).getReg();
1989   Register SrcReg = MI.getOperand(1).getReg();
1990   LLT Ty = MRI.getType(DstReg);
1991   unsigned Flags = MI.getFlags();
1992 
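  // The hardware sin/cos take their input scaled by 1/(2*pi), i.e. 1.0
  // corresponds to a full period; subtargets with a reduced trig input range
  // additionally need the scaled value wrapped into [0, 1) with fract.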
1993   Register TrigVal;
1994   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1995   if (ST.hasTrigReducedRange()) {
1996     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1997     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1998       .addUse(MulVal.getReg(0))
1999       .setMIFlags(Flags).getReg(0);
2000   } else
2001     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2002 
2003   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2004     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2005   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2006     .addUse(TrigVal)
2007     .setMIFlags(Flags);
2008   MI.eraseFromParent();
2009   return true;
2010 }
2011 
2012 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2013                                                   MachineIRBuilder &B,
2014                                                   const GlobalValue *GV,
2015                                                   int64_t Offset,
2016                                                   unsigned GAFlags) const {
2017   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2018   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2019   // to the following code sequence:
2020   //
2021   // For constant address space:
2022   //   s_getpc_b64 s[0:1]
2023   //   s_add_u32 s0, s0, $symbol
2024   //   s_addc_u32 s1, s1, 0
2025   //
2026   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2027   //   a fixup or relocation is emitted to replace $symbol with a literal
2028   //   constant, which is a pc-relative offset from the encoding of the $symbol
2029   //   operand to the global variable.
2030   //
2031   // For global address space:
2032   //   s_getpc_b64 s[0:1]
2033   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2034   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2035   //
2036   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2037   //   fixups or relocations are emitted to replace $symbol@*@lo and
2038   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2039   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2040   //   operand to the global variable.
2041   //
2042   // What we want here is an offset from the value returned by s_getpc
2043   // (which is the address of the s_add_u32 instruction) to the global
2044   // variable, but since the encoding of $symbol starts 4 bytes after the start
2045   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2046   // small. This requires us to add 4 to the global variable offset in order to
2047   // compute the correct address.
2048 
2049   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2050 
2051   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2052     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2053 
2054   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2055     .addDef(PCReg);
2056 
2057   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2058   if (GAFlags == SIInstrInfo::MO_NONE)
2059     MIB.addImm(0);
2060   else
2061     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2062 
2063   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2064 
2065   if (PtrTy.getSizeInBits() == 32)
2066     B.buildExtract(DstReg, PCReg, 0);
2067   return true;
2068 }
2069 
2070 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2071   MachineInstr &MI, MachineRegisterInfo &MRI,
2072   MachineIRBuilder &B) const {
2073   Register DstReg = MI.getOperand(0).getReg();
2074   LLT Ty = MRI.getType(DstReg);
2075   unsigned AS = Ty.getAddressSpace();
2076 
2077   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2078   MachineFunction &MF = B.getMF();
2079   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2080 
2081   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2082     if (!MFI->isEntryFunction()) {
2083       const Function &Fn = MF.getFunction();
2084       DiagnosticInfoUnsupported BadLDSDecl(
2085         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2086         DS_Warning);
2087       Fn.getContext().diagnose(BadLDSDecl);
2088 
2089       // We currently don't have a way to correctly allocate LDS objects that
2090       // aren't directly associated with a kernel. We do force inlining of
2091       // functions that use local objects. However, if these dead functions are
2092       // not eliminated, we don't want a compile time error. Just emit a warning
2093       // and a trap, since there should be no callable path here.
2094       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2095       B.buildUndef(DstReg);
2096       MI.eraseFromParent();
2097       return true;
2098     }
2099 
2100     // TODO: We could emit code to handle the initialization somewhere.
2101     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2102       const SITargetLowering *TLI = ST.getTargetLowering();
2103       if (!TLI->shouldUseLDSConstAddress(GV)) {
2104         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2105         return true; // Leave in place;
2106       }
2107 
2108       B.buildConstant(
2109           DstReg,
2110           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2111       MI.eraseFromParent();
2112       return true;
2113     }
2114 
2115     const Function &Fn = MF.getFunction();
2116     DiagnosticInfoUnsupported BadInit(
2117       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2118     Fn.getContext().diagnose(BadInit);
2119     return true;
2120   }
2121 
2122   const SITargetLowering *TLI = ST.getTargetLowering();
2123 
2124   if (TLI->shouldEmitFixup(GV)) {
2125     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2126     MI.eraseFromParent();
2127     return true;
2128   }
2129 
2130   if (TLI->shouldEmitPCReloc(GV)) {
2131     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2132     MI.eraseFromParent();
2133     return true;
2134   }
2135 
2136   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2137   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2138 
2139   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2140       MachinePointerInfo::getGOT(MF),
2141       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2142           MachineMemOperand::MOInvariant,
2143       8 /*Size*/, Align(8));
2144 
2145   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2146 
2147   if (Ty.getSizeInBits() == 32) {
2148     // Truncate if this is a 32-bit constant address.
2149     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2150     B.buildExtract(DstReg, Load, 0);
2151   } else
2152     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2153 
2154   MI.eraseFromParent();
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeLoad(
2159   MachineInstr &MI, MachineRegisterInfo &MRI,
2160   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2161   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2162   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2163   Observer.changingInstr(MI);
2164   MI.getOperand(1).setReg(Cast.getReg(0));
2165   Observer.changedInstr(MI);
2166   return true;
2167 }
2168 
2169 bool AMDGPULegalizerInfo::legalizeFMad(
2170   MachineInstr &MI, MachineRegisterInfo &MRI,
2171   MachineIRBuilder &B) const {
2172   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2173   assert(Ty.isScalar());
2174 
2175   MachineFunction &MF = B.getMF();
2176   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2177 
2178   // TODO: Always legal with future ftz flag.
2179   // FIXME: Do we need just output?
2180   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2181     return true;
2182   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2183     return true;
2184 
2185   MachineIRBuilder HelperBuilder(MI);
2186   GISelObserverWrapper DummyObserver;
2187   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2188   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2189 }
2190 
2191 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2192   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2193   Register DstReg = MI.getOperand(0).getReg();
2194   Register PtrReg = MI.getOperand(1).getReg();
2195   Register CmpVal = MI.getOperand(2).getReg();
2196   Register NewVal = MI.getOperand(3).getReg();
2197 
2198   assert(SITargetLowering::isFlatGlobalAddrSpace(
2199            MRI.getType(PtrReg).getAddressSpace()) &&
2200          "this should not have been custom lowered");
2201 
2202   LLT ValTy = MRI.getType(CmpVal);
2203   LLT VecTy = LLT::vector(2, ValTy);
2204 
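  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a two-element vector operand.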
2205   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2206 
2207   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2208     .addDef(DstReg)
2209     .addUse(PtrReg)
2210     .addUse(PackedVal)
2211     .setMemRefs(MI.memoperands());
2212 
2213   MI.eraseFromParent();
2214   return true;
2215 }
2216 
2217 bool AMDGPULegalizerInfo::legalizeFlog(
2218   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2219   Register Dst = MI.getOperand(0).getReg();
2220   Register Src = MI.getOperand(1).getReg();
2221   LLT Ty = B.getMRI()->getType(Dst);
2222   unsigned Flags = MI.getFlags();
2223 
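  // log_b(x) = log2(x) * log_b(2); the caller passes log_b(2) (ln 2 for
  // G_FLOG, ln 2 / ln 10 for G_FLOG10) as Log2BaseInverted.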
2224   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2225   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2226 
2227   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2228   MI.eraseFromParent();
2229   return true;
2230 }
2231 
2232 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2233                                        MachineIRBuilder &B) const {
2234   Register Dst = MI.getOperand(0).getReg();
2235   Register Src = MI.getOperand(1).getReg();
2236   unsigned Flags = MI.getFlags();
2237   LLT Ty = B.getMRI()->getType(Dst);
2238 
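  // exp(x) = exp2(x * log2(e)).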
2239   auto K = B.buildFConstant(Ty, numbers::log2e);
2240   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2241   B.buildFExp2(Dst, Mul, Flags);
2242   MI.eraseFromParent();
2243   return true;
2244 }
2245 
2246 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2247                                        MachineIRBuilder &B) const {
2248   Register Dst = MI.getOperand(0).getReg();
2249   Register Src0 = MI.getOperand(1).getReg();
2250   Register Src1 = MI.getOperand(2).getReg();
2251   unsigned Flags = MI.getFlags();
2252   LLT Ty = B.getMRI()->getType(Dst);
2253   const LLT S16 = LLT::scalar(16);
2254   const LLT S32 = LLT::scalar(32);
2255 
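  // pow(x, y) is expanded as exp2(y * log2(x)). fmul_legacy (where 0 * x is
  // always 0) is used for the multiply, presumably to keep the zero and
  // infinity corner cases consistent with the DAG lowering.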
2256   if (Ty == S32) {
2257     auto Log = B.buildFLog2(S32, Src0, Flags);
2258     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2259       .addUse(Log.getReg(0))
2260       .addUse(Src1)
2261       .setMIFlags(Flags);
2262     B.buildFExp2(Dst, Mul, Flags);
2263   } else if (Ty == S16) {
2264     // There's no f16 fmul_legacy, so we need to convert for it.
2265     auto Log = B.buildFLog2(S16, Src0, Flags);
2266     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2267     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2268     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2269       .addUse(Ext0.getReg(0))
2270       .addUse(Ext1.getReg(0))
2271       .setMIFlags(Flags);
2272 
2273     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2274   } else
2275     return false;
2276 
2277   MI.eraseFromParent();
2278   return true;
2279 }
2280 
2281 // Find a source register, ignoring any possible source modifiers.
2282 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2283   Register ModSrc = OrigSrc;
2284   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2285     ModSrc = SrcFNeg->getOperand(1).getReg();
2286     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2287       ModSrc = SrcFAbs->getOperand(1).getReg();
2288   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2289     ModSrc = SrcFAbs->getOperand(1).getReg();
2290   return ModSrc;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2294                                          MachineRegisterInfo &MRI,
2295                                          MachineIRBuilder &B) const {
2296 
2297   const LLT S1 = LLT::scalar(1);
2298   const LLT S64 = LLT::scalar(64);
2299   Register Dst = MI.getOperand(0).getReg();
2300   Register OrigSrc = MI.getOperand(1).getReg();
2301   unsigned Flags = MI.getFlags();
2302   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2303          "this should not have been custom lowered");
2304 
2305   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2306   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2307   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2308   // V_FRACT bug is:
2309   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2310   //
2311   // Convert floor(x) to (x - fract(x))
2312 
2313   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2314     .addUse(OrigSrc)
2315     .setMIFlags(Flags);
2316 
2317   // Give source modifier matching some assistance before obscuring a foldable
2318   // pattern.
2319 
2320   // TODO: Can we avoid the neg on the fract? The input sign to fract
2321   // shouldn't matter?
2322   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2323 
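  // 0x3fefffffffffffff is the largest f64 strictly less than 1.0, the clamp
  // value used in the workaround described above.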
2324   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2325 
2326   Register Min = MRI.createGenericVirtualRegister(S64);
2327 
2328   // We don't need to concern ourselves with the snan handling difference, so
2329   // use the one which will directly select.
2330   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2331   if (MFI->getMode().IEEE)
2332     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2333   else
2334     B.buildFMinNum(Min, Fract, Const, Flags);
2335 
2336   Register CorrectedFract = Min;
2337   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2338     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2339     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2340   }
2341 
2342   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2343   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2344 
2345   MI.eraseFromParent();
2346   return true;
2347 }
2348 
2349 // Turn an illegal packed v2s16 build vector into bit operations.
2350 // TODO: This should probably be a bitcast action in LegalizerHelper.
2351 bool AMDGPULegalizerInfo::legalizeBuildVector(
2352   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2353   Register Dst = MI.getOperand(0).getReg();
2354   const LLT S32 = LLT::scalar(32);
2355   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2356 
2357   Register Src0 = MI.getOperand(1).getReg();
2358   Register Src1 = MI.getOperand(2).getReg();
2359   assert(MRI.getType(Src0) == LLT::scalar(16));
2360 
2361   auto Merge = B.buildMerge(S32, {Src0, Src1});
2362   B.buildBitcast(Dst, Merge);
2363 
2364   MI.eraseFromParent();
2365   return true;
2366 }
2367 
2368 // Return the use branch instruction, or null if the usage is invalid.
2369 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2370                                        MachineRegisterInfo &MRI,
2371                                        MachineInstr *&Br,
2372                                        MachineBasicBlock *&UncondBrTarget) {
2373   Register CondDef = MI.getOperand(0).getReg();
2374   if (!MRI.hasOneNonDBGUse(CondDef))
2375     return nullptr;
2376 
2377   MachineBasicBlock *Parent = MI.getParent();
2378   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2379   if (UseMI.getParent() != Parent ||
2380       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2381     return nullptr;
2382 
2383   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2384   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2385   if (Next == Parent->end()) {
2386     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2387     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2388       return nullptr;
2389     UncondBrTarget = &*NextMBB;
2390   } else {
2391     if (Next->getOpcode() != AMDGPU::G_BR)
2392       return nullptr;
2393     Br = &*Next;
2394     UncondBrTarget = Br->getOperand(0).getMBB();
2395   }
2396 
2397   return &UseMI;
2398 }
2399 
2400 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2401                                                MachineRegisterInfo &MRI,
2402                                                Register LiveIn,
2403                                                Register PhyReg) const {
2404   assert(PhyReg.isPhysical() && "Physical register expected");
2405 
2406   // Insert the live-in copy, if required, by defining destination virtual
2407   // register.
2408   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2409   if (!MRI.getVRegDef(LiveIn)) {
2410     // FIXME: Should have scoped insert pt
2411     MachineBasicBlock &OrigInsBB = B.getMBB();
2412     auto OrigInsPt = B.getInsertPt();
2413 
2414     MachineBasicBlock &EntryMBB = B.getMF().front();
2415     EntryMBB.addLiveIn(PhyReg);
2416     B.setInsertPt(EntryMBB, EntryMBB.begin());
2417     B.buildCopy(LiveIn, PhyReg);
2418 
2419     B.setInsertPt(OrigInsBB, OrigInsPt);
2420   }
2421 
2422   return LiveIn;
2423 }
2424 
2425 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2426                                                 MachineRegisterInfo &MRI,
2427                                                 Register PhyReg, LLT Ty,
2428                                                 bool InsertLiveInCopy) const {
2429   assert(PhyReg.isPhysical() && "Physical register expected");
2430 
2431   // Get or create the virtual live-in register.
2432   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2433   if (!LiveIn) {
2434     LiveIn = MRI.createGenericVirtualRegister(Ty);
2435     MRI.addLiveIn(PhyReg, LiveIn);
2436   }
2437 
2438   // When the actual copy required is from a virtual register to a physical
2439   // register (to be inserted later), inserting a live-in copy from the
2440   // physical register to the virtual register is not required.
2441   if (!InsertLiveInCopy)
2442     return LiveIn;
2443 
2444   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2445 }
2446 
2447 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2448     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2449   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2450   const ArgDescriptor *Arg;
2451   const TargetRegisterClass *RC;
2452   LLT ArgTy;
2453   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2454   if (!Arg) {
2455     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2456     return nullptr;
2457   }
2458   return Arg;
2459 }
2460 
2461 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2462                                          const ArgDescriptor *Arg) const {
2463   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2464     return false; // TODO: Handle these
2465 
2466   Register SrcReg = Arg->getRegister();
2467   assert(SrcReg.isPhysical() && "Physical register expected");
2468   assert(DstReg.isVirtual() && "Virtual register expected");
2469 
2470   MachineRegisterInfo &MRI = *B.getMRI();
2471 
2472   LLT Ty = MRI.getType(DstReg);
2473   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2474 
2475   if (Arg->isMasked()) {
2476     // TODO: Should we try to emit this once in the entry block?
2477     const LLT S32 = LLT::scalar(32);
2478     const unsigned Mask = Arg->getMask();
2479     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2480 
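    // The argument occupies a bit field of a (possibly shared) input
    // register: shift down to the start of the field and mask off its width.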
2481     Register AndMaskSrc = LiveIn;
2482 
2483     if (Shift != 0) {
2484       auto ShiftAmt = B.buildConstant(S32, Shift);
2485       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2486     }
2487 
2488     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2489   } else {
2490     B.buildCopy(DstReg, LiveIn);
2491   }
2492 
2493   return true;
2494 }
2495 
2496 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2497     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2498     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2499 
2500   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2501   if (!Arg)
2502     return false;
2503 
2504   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2505     return false;
2506 
2507   MI.eraseFromParent();
2508   return true;
2509 }
2510 
2511 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2512                                        MachineRegisterInfo &MRI,
2513                                        MachineIRBuilder &B) const {
2514   Register Dst = MI.getOperand(0).getReg();
2515   LLT DstTy = MRI.getType(Dst);
2516   LLT S16 = LLT::scalar(16);
2517   LLT S32 = LLT::scalar(32);
2518   LLT S64 = LLT::scalar(64);
2519 
2520   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2521     return true;
2522 
2523   if (DstTy == S16)
2524     return legalizeFDIV16(MI, MRI, B);
2525   if (DstTy == S32)
2526     return legalizeFDIV32(MI, MRI, B);
2527   if (DstTy == S64)
2528     return legalizeFDIV64(MI, MRI, B);
2529 
2530   return false;
2531 }
2532 
2533 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2534                                                   Register DstReg,
2535                                                   Register X,
2536                                                   Register Y,
2537                                                   bool IsDiv) const {
2538   const LLT S1 = LLT::scalar(1);
2539   const LLT S32 = LLT::scalar(32);
2540 
2541   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2542   // algorithm used here.
2543 
2544   // Initial estimate of inv(y).
2545   auto FloatY = B.buildUITOFP(S32, Y);
2546   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2547   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2548   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2549   auto Z = B.buildFPTOUI(S32, ScaledY);
2550 
2551   // One round of UNR.
2552   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2553   auto NegYZ = B.buildMul(S32, NegY, Z);
2554   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2555 
2556   // Quotient/remainder estimate.
2557   auto Q = B.buildUMulH(S32, X, Z);
2558   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2559 
2560   // First quotient/remainder refinement.
2561   auto One = B.buildConstant(S32, 1);
2562   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2563   if (IsDiv)
2564     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2565   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2566 
2567   // Second quotient/remainder refinement.
2568   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2569   if (IsDiv)
2570     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2571   else
2572     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2573 }
2574 
2575 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2576                                               MachineRegisterInfo &MRI,
2577                                               MachineIRBuilder &B) const {
2578   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2579   Register DstReg = MI.getOperand(0).getReg();
2580   Register Num = MI.getOperand(1).getReg();
2581   Register Den = MI.getOperand(2).getReg();
2582   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2583   MI.eraseFromParent();
2584   return true;
2585 }
2586 
2587 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2588 //
2589 // Return lo, hi of result
2590 //
2591 // %cvt.lo = G_UITOFP Val.lo
2592 // %cvt.hi = G_UITOFP Val.hi
2593 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2594 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2595 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2596 // %mul2 = G_FMUL %mul1, 2**(-32)
2597 // %trunc = G_INTRINSIC_TRUNC %mul2
2598 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2599 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2600 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2601                                                        Register Val) {
2602   const LLT S32 = LLT::scalar(32);
2603   auto Unmerge = B.buildUnmerge(S32, Val);
2604 
2605   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2606   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2607 
2608   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2609                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2610 
2611   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2612   auto Mul1 =
2613       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2614 
2615   // 2**(-32)
2616   auto Mul2 =
2617       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2618   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2619 
2620   // -(2**32)
2621   auto Mad2 = B.buildFMAD(S32, Trunc,
2622                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2623 
2624   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2625   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2626 
2627   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2628 }
2629 
2630 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2631                                                   Register DstReg,
2632                                                   Register Numer,
2633                                                   Register Denom,
2634                                                   bool IsDiv) const {
2635   const LLT S32 = LLT::scalar(32);
2636   const LLT S64 = LLT::scalar(64);
2637   const LLT S1 = LLT::scalar(1);
2638   Register RcpLo, RcpHi;
2639 
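  // This broadly follows the same estimate-and-correct scheme as the 32-bit
  // expansion above, but the reciprocal refinement and the two correction
  // steps are done on the unmerged 32-bit halves with explicit carry chains.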
2640   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2641 
2642   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2643 
2644   auto Zero64 = B.buildConstant(S64, 0);
2645   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2646 
2647   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2648   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2649 
2650   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2651   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2652   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2653 
2654   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2655   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2656   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2657   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2658 
2659   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2660   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2661   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2662   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2663   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2664 
2665   auto Zero32 = B.buildConstant(S32, 0);
2666   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2667   auto Add2_HiC =
2668       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2669   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2670   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2671 
2672   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2673   Register NumerLo = UnmergeNumer.getReg(0);
2674   Register NumerHi = UnmergeNumer.getReg(1);
2675 
2676   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2677   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2678   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2679   Register Mul3_Lo = UnmergeMul3.getReg(0);
2680   Register Mul3_Hi = UnmergeMul3.getReg(1);
2681   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2682   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2683   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2684   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2685 
2686   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2687   Register DenomLo = UnmergeDenom.getReg(0);
2688   Register DenomHi = UnmergeDenom.getReg(1);
2689 
2690   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2691   auto C1 = B.buildSExt(S32, CmpHi);
2692 
2693   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2694   auto C2 = B.buildSExt(S32, CmpLo);
2695 
2696   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2697   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2698 
2699   // TODO: Here and below portions of the code can be enclosed into if/endif.
2700   // Currently control flow is unconditional and we have 4 selects after
2701   // potential endif to substitute PHIs.
2702 
2703   // if C3 != 0 ...
2704   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2705   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2706   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2707   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2708 
2709   auto One64 = B.buildConstant(S64, 1);
2710   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2711 
2712   auto C4 =
2713       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2714   auto C5 =
2715       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2716   auto C6 = B.buildSelect(
2717       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2718 
2719   // if (C6 != 0)
2720   auto Add4 = B.buildAdd(S64, Add3, One64);
2721   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2722 
2723   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2724   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2725   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2726 
2727   // endif C6
2728   // endif C3
2729 
2730   if (IsDiv) {
2731     auto Sel1 = B.buildSelect(
2732         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2733     B.buildSelect(DstReg,
2734                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2735   } else {
2736     auto Sel2 = B.buildSelect(
2737         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2738     B.buildSelect(DstReg,
2739                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2740   }
2741 }
2742 
2743 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2744                                             MachineRegisterInfo &MRI,
2745                                             MachineIRBuilder &B) const {
2746   const LLT S64 = LLT::scalar(64);
2747   const LLT S32 = LLT::scalar(32);
2748   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2749   Register DstReg = MI.getOperand(0).getReg();
2750   Register Num = MI.getOperand(1).getReg();
2751   Register Den = MI.getOperand(2).getReg();
2752   LLT Ty = MRI.getType(DstReg);
2753 
2754   if (Ty == S32)
2755     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2756   else if (Ty == S64)
2757     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2758   else
2759     return false;
2760 
2761   MI.eraseFromParent();
2762   return true;
2764 }
2765 
2766 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2767                                             MachineRegisterInfo &MRI,
2768                                             MachineIRBuilder &B) const {
2769   const LLT S64 = LLT::scalar(64);
2770   const LLT S32 = LLT::scalar(32);
2771 
2772   Register DstReg = MI.getOperand(0).getReg();
2773   const LLT Ty = MRI.getType(DstReg);
2774   if (Ty != S32 && Ty != S64)
2775     return false;
2776 
2777   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2778 
2779   Register LHS = MI.getOperand(1).getReg();
2780   Register RHS = MI.getOperand(2).getReg();
2781 
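  // Take absolute values using the two's complement identity
  // abs(x) = (x + sign) ^ sign, where sign is x >> (bits - 1); e.g. for
  // x = -7, sign is all ones and (-7 - 1) ^ -1 == 7. The unsigned result is
  // fixed up the same way below: the quotient uses the xor of the operand
  // signs, the remainder uses the sign of the LHS.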
2782   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2783   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2784   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2785 
2786   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2787   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2788 
2789   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2790   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2791 
2792   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2793   if (Ty == S32)
2794     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2795   else
2796     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2797 
2798   Register Sign;
2799   if (IsDiv)
2800     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2801   else
2802     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2803 
2804   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2805   B.buildSub(DstReg, UDivRem, Sign);
2806 
2807   MI.eraseFromParent();
2808   return true;
2809 }
2810 
2811 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2812                                                  MachineRegisterInfo &MRI,
2813                                                  MachineIRBuilder &B) const {
2814   Register Res = MI.getOperand(0).getReg();
2815   Register LHS = MI.getOperand(1).getReg();
2816   Register RHS = MI.getOperand(2).getReg();
2817 
2818   uint16_t Flags = MI.getFlags();
2819 
2820   LLT ResTy = MRI.getType(Res);
2821   LLT S32 = LLT::scalar(32);
2822   LLT S64 = LLT::scalar(64);
2823 
2824   const MachineFunction &MF = B.getMF();
2825   bool Unsafe =
2826     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2827 
2828   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2829     return false;
2830 
2831   if (!Unsafe && ResTy == S32 &&
2832       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2833     return false;
2834 
2835   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2836     // 1 / x -> RCP(x)
2837     if (CLHS->isExactlyValue(1.0)) {
2838       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2839         .addUse(RHS)
2840         .setMIFlags(Flags);
2841 
2842       MI.eraseFromParent();
2843       return true;
2844     }
2845 
2846     // -1 / x -> RCP( FNEG(x) )
2847     if (CLHS->isExactlyValue(-1.0)) {
2848       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2849       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2850         .addUse(FNeg.getReg(0))
2851         .setMIFlags(Flags);
2852 
2853       MI.eraseFromParent();
2854       return true;
2855     }
2856   }
2857 
2858   // x / y -> x * (1.0 / y)
2859   if (Unsafe) {
2860     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2861       .addUse(RHS)
2862       .setMIFlags(Flags);
2863     B.buildFMul(Res, LHS, RCP, Flags);
2864 
2865     MI.eraseFromParent();
2866     return true;
2867   }
2868 
2869   return false;
2870 }
2871 
2872 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2873                                          MachineRegisterInfo &MRI,
2874                                          MachineIRBuilder &B) const {
2875   Register Res = MI.getOperand(0).getReg();
2876   Register LHS = MI.getOperand(1).getReg();
2877   Register RHS = MI.getOperand(2).getReg();
2878 
2879   uint16_t Flags = MI.getFlags();
2880 
2881   LLT S16 = LLT::scalar(16);
2882   LLT S32 = LLT::scalar(32);
2883 
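  // The quotient is computed at f32 precision (extend, rcp, multiply,
  // truncate); div_fixup then handles the special cases.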
2884   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2885   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2886 
2887   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2888     .addUse(RHSExt.getReg(0))
2889     .setMIFlags(Flags);
2890 
2891   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2892   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2893 
2894   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2895     .addUse(RDst.getReg(0))
2896     .addUse(RHS)
2897     .addUse(LHS)
2898     .setMIFlags(Flags);
2899 
2900   MI.eraseFromParent();
2901   return true;
2902 }
2903 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; when 'Enable' is false, restore the function's default
// FP32 denorm mode.
2906 static void toggleSPDenormMode(bool Enable,
2907                                MachineIRBuilder &B,
2908                                const GCNSubtarget &ST,
2909                                AMDGPU::SIModeRegisterDefaults Mode) {
2910   // Set SP denorm mode to this value.
2911   unsigned SPDenormMode =
2912     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2913 
2914   if (ST.hasDenormModeInst()) {
2915     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2916     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2917 
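    // The 4-bit S_DENORM_MODE immediate packs the FP32 (SP) denorm mode in
    // bits [1:0] and the FP64/FP16 (DP) mode in bits [3:2], matching the
    // FP_DENORM field of the MODE register.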
2918     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2919     B.buildInstr(AMDGPU::S_DENORM_MODE)
2920       .addImm(NewDenormModeValue);
2921 
2922   } else {
2923     // Select FP32 bit field in mode register.
2924     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2925                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2926                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2927 
2928     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2929       .addImm(SPDenormMode)
2930       .addImm(SPDenormModeBitField);
2931   }
2932 }
2933 
2934 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2935                                          MachineRegisterInfo &MRI,
2936                                          MachineIRBuilder &B) const {
2937   Register Res = MI.getOperand(0).getReg();
2938   Register LHS = MI.getOperand(1).getReg();
2939   Register RHS = MI.getOperand(2).getReg();
2940   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2941   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2942 
2943   uint16_t Flags = MI.getFlags();
2944 
2945   LLT S32 = LLT::scalar(32);
2946   LLT S1 = LLT::scalar(1);
2947 
2948   auto One = B.buildFConstant(S32, 1.0f);
2949 
2950   auto DenominatorScaled =
2951     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2952       .addUse(LHS)
2953       .addUse(RHS)
2954       .addImm(0)
2955       .setMIFlags(Flags);
2956   auto NumeratorScaled =
2957     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2958       .addUse(LHS)
2959       .addUse(RHS)
2960       .addImm(1)
2961       .setMIFlags(Flags);
2962 
2963   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2964     .addUse(DenominatorScaled.getReg(0))
2965     .setMIFlags(Flags);
2966   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2967 
2968   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2969   // aren't modeled as reading it.
2970   if (!Mode.allFP32Denormals())
2971     toggleSPDenormMode(true, B, ST, Mode);
2972 
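  // Refine the reciprocal and the quotient with FMAs. Roughly: e = 1 - d*r;
  // r' = r + r*e; q = n*r'; q' = q + r'*(n - d*q); the final n - d*q' residual
  // feeds div_fmas, which applies the last correction and the scale factor.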
2973   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2974   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2975   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2976   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2977   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2978   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2979 
2980   if (!Mode.allFP32Denormals())
2981     toggleSPDenormMode(false, B, ST, Mode);
2982 
2983   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2984     .addUse(Fma4.getReg(0))
2985     .addUse(Fma1.getReg(0))
2986     .addUse(Fma3.getReg(0))
2987     .addUse(NumeratorScaled.getReg(1))
2988     .setMIFlags(Flags);
2989 
2990   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2991     .addUse(Fmas.getReg(0))
2992     .addUse(RHS)
2993     .addUse(LHS)
2994     .setMIFlags(Flags);
2995 
2996   MI.eraseFromParent();
2997   return true;
2998 }
2999 
3000 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3001                                          MachineRegisterInfo &MRI,
3002                                          MachineIRBuilder &B) const {
3003   Register Res = MI.getOperand(0).getReg();
3004   Register LHS = MI.getOperand(1).getReg();
3005   Register RHS = MI.getOperand(2).getReg();
3006 
3007   uint16_t Flags = MI.getFlags();
3008 
3009   LLT S64 = LLT::scalar(64);
3010   LLT S1 = LLT::scalar(1);
3011 
3012   auto One = B.buildFConstant(S64, 1.0);
3013 
3014   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3015     .addUse(LHS)
3016     .addUse(RHS)
3017     .addImm(0)
3018     .setMIFlags(Flags);
3019 
3020   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3021 
3022   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3023     .addUse(DivScale0.getReg(0))
3024     .setMIFlags(Flags);
3025 
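  // Same scheme as the f32 path above, in double precision: two reciprocal
  // refinement steps (Fma0..Fma3), then the scaled quotient and its residual
  // are formed for div_fmas.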
3026   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3027   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3028   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3029 
3030   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3031     .addUse(LHS)
3032     .addUse(RHS)
3033     .addImm(1)
3034     .setMIFlags(Flags);
3035 
3036   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3037   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3038   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3039 
3040   Register Scale;
3041   if (!ST.hasUsableDivScaleConditionOutput()) {
3042     // Workaround a hardware bug on SI where the condition output from div_scale
3043     // is not usable.
3044 
3045     LLT S32 = LLT::scalar(32);
3046 
3047     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3048     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3049     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3050     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3051 
3052     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3053                               Scale1Unmerge.getReg(1));
3054     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3055                               Scale0Unmerge.getReg(1));
3056     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3057   } else {
3058     Scale = DivScale1.getReg(1);
3059   }
3060 
3061   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3062     .addUse(Fma4.getReg(0))
3063     .addUse(Fma3.getReg(0))
3064     .addUse(Mul.getReg(0))
3065     .addUse(Scale)
3066     .setMIFlags(Flags);
3067 
3068   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3069     .addUse(Fmas.getReg(0))
3070     .addUse(RHS)
3071     .addUse(LHS)
3072     .setMIFlags(Flags);
3073 
3074   MI.eraseFromParent();
3075   return true;
3076 }
3077 
3078 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3079                                                  MachineRegisterInfo &MRI,
3080                                                  MachineIRBuilder &B) const {
3081   Register Res = MI.getOperand(0).getReg();
3082   Register LHS = MI.getOperand(2).getReg();
3083   Register RHS = MI.getOperand(3).getReg();
3084   uint16_t Flags = MI.getFlags();
3085 
3086   LLT S32 = LLT::scalar(32);
3087   LLT S1 = LLT::scalar(1);
3088 
3089   auto Abs = B.buildFAbs(S32, RHS, Flags);
3090   const APFloat C0Val(1.0f);
3091 
3092   auto C0 = B.buildConstant(S32, 0x6f800000);
3093   auto C1 = B.buildConstant(S32, 0x2f800000);
3094   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
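  // 0x6f800000 is 2.0^96 and 0x2f800000 is 2.0^-32 as f32 bit patterns: if
  // |RHS| exceeds 2^96, pre-scale it by 2^-32 so its reciprocal stays out of
  // the denormal range, then multiply the result by the same factor to
  // compensate.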
3095 
3096   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3097   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3098 
3099   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3100 
3101   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3102     .addUse(Mul0.getReg(0))
3103     .setMIFlags(Flags);
3104 
3105   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3106 
3107   B.buildFMul(Res, Sel, Mul1, Flags);
3108 
3109   MI.eraseFromParent();
3110   return true;
3111 }
3112 
3113 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3114                                                  MachineRegisterInfo &MRI,
3115                                                  MachineIRBuilder &B) const {
3116   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3117   if (!MFI->isEntryFunction()) {
3118     return legalizePreloadedArgIntrin(MI, MRI, B,
3119                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3120   }
3121 
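  // In a kernel the implicit arguments live in the kernarg segment immediately
  // after the explicit arguments, so the pointer is kernarg_segment_ptr plus a
  // constant offset.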
3122   uint64_t Offset =
3123     ST.getTargetLowering()->getImplicitParameterOffset(
3124       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3125   Register DstReg = MI.getOperand(0).getReg();
3126   LLT DstTy = MRI.getType(DstReg);
3127   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3128 
3129   const ArgDescriptor *Arg;
3130   const TargetRegisterClass *RC;
3131   LLT ArgTy;
3132   std::tie(Arg, RC, ArgTy) =
3133       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3134   if (!Arg)
3135     return false;
3136 
3137   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3138   if (!loadInputValue(KernargPtrReg, B, Arg))
3139     return false;
3140 
3141   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3142   MI.eraseFromParent();
3143   return true;
3144 }
3145 
3146 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3147                                               MachineRegisterInfo &MRI,
3148                                               MachineIRBuilder &B,
3149                                               unsigned AddrSpace) const {
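  // A flat pointer belongs to the queried segment iff the high 32 bits of the
  // address match that segment's aperture, so compare the high half against
  // the aperture register.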
3150   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3151   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3152   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3153   MI.eraseFromParent();
3154   return true;
3155 }
3156 
3157 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3158 // offset (the offset that is included in bounds checking and swizzling, to be
3159 // split between the instruction's voffset and immoffset fields) and soffset
3160 // (the offset that is excluded from bounds checking and swizzling, to go in
3161 // the instruction's soffset field).  This function takes the first kind of
3162 // offset and figures out how to split it between voffset and immoffset.
3163 std::tuple<Register, unsigned, unsigned>
3164 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3165                                         Register OrigOffset) const {
3166   const unsigned MaxImm = 4095;
3167   Register BaseReg;
3168   unsigned TotalConstOffset;
3169   MachineInstr *OffsetDef;
3170   const LLT S32 = LLT::scalar(32);
3171 
3172   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3173     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3174 
3175   unsigned ImmOffset = TotalConstOffset;
3176 
3177   // If the immediate value is too big for the immoffset field, put the value
3178   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
3180   // of being CSEd with the copy/add for another similar load/store.
3181   // However, do not do that rounding down to a multiple of 4096 if that is a
3182   // negative number, as it appears to be illegal to have a negative offset
3183   // in the vgpr, even if adding the immediate offset makes it positive.
3184   unsigned Overflow = ImmOffset & ~MaxImm;
3185   ImmOffset -= Overflow;
3186   if ((int32_t)Overflow < 0) {
3187     Overflow += ImmOffset;
3188     ImmOffset = 0;
3189   }
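  // For example, a constant offset of 4104 splits into Overflow = 4096 (folded
  // into the voffset register below) and ImmOffset = 8.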
3190 
3191   if (Overflow != 0) {
3192     if (!BaseReg) {
3193       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3194     } else {
3195       auto OverflowVal = B.buildConstant(S32, Overflow);
3196       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3197     }
3198   }
3199 
3200   if (!BaseReg)
3201     BaseReg = B.buildConstant(S32, 0).getReg(0);
3202 
3203   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3204 }
3205 
3206 /// Handle register layout difference for f16 images for some subtargets.
3207 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3208                                              MachineRegisterInfo &MRI,
3209                                              Register Reg) const {
3210   if (!ST.hasUnpackedD16VMem())
3211     return Reg;
3212 
3213   const LLT S16 = LLT::scalar(16);
3214   const LLT S32 = LLT::scalar(32);
3215   LLT StoreVT = MRI.getType(Reg);
3216   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3217 
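  // In the unpacked layout every 16-bit element occupies the low half of its
  // own 32-bit register, so e.g. a <4 x s16> store source becomes <4 x s32>
  // with each element any-extended.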
3218   auto Unmerge = B.buildUnmerge(S16, Reg);
3219 
3220   SmallVector<Register, 4> WideRegs;
3221   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3222     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3223 
3224   int NumElts = StoreVT.getNumElements();
3225 
3226   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3227 }
3228 
3229 Register AMDGPULegalizerInfo::fixStoreSourceType(
3230   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3231   MachineRegisterInfo *MRI = B.getMRI();
3232   LLT Ty = MRI->getType(VData);
3233 
3234   const LLT S16 = LLT::scalar(16);
3235 
3236   // Fixup illegal register types for i8 stores.
3237   if (Ty == LLT::scalar(8) || Ty == S16) {
3238     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3239     return AnyExt;
3240   }
3241 
3242   if (Ty.isVector()) {
3243     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3244       if (IsFormat)
3245         return handleD16VData(B, *MRI, VData);
3246     }
3247   }
3248 
3249   return VData;
3250 }
3251 
3252 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3253                                               MachineRegisterInfo &MRI,
3254                                               MachineIRBuilder &B,
3255                                               bool IsTyped,
3256                                               bool IsFormat) const {
3257   Register VData = MI.getOperand(1).getReg();
3258   LLT Ty = MRI.getType(VData);
3259   LLT EltTy = Ty.getScalarType();
3260   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3261   const LLT S32 = LLT::scalar(32);
3262 
3263   VData = fixStoreSourceType(B, VData, IsFormat);
3264   Register RSrc = MI.getOperand(2).getReg();
3265 
3266   MachineMemOperand *MMO = *MI.memoperands_begin();
3267   const int MemSize = MMO->getSize();
3268 
3269   unsigned ImmOffset;
3270   unsigned TotalOffset;
3271 
3272   // The typed intrinsics add an immediate after the registers.
3273   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3274 
3275   // The struct intrinsic variants add one additional operand over raw.
3276   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3277   Register VIndex;
3278   int OpOffset = 0;
3279   if (HasVIndex) {
3280     VIndex = MI.getOperand(3).getReg();
3281     OpOffset = 1;
3282   }
3283 
3284   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3285   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3286 
3287   unsigned Format = 0;
3288   if (IsTyped) {
3289     Format = MI.getOperand(5 + OpOffset).getImm();
3290     ++OpOffset;
3291   }
3292 
3293   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3294 
3295   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3296   if (TotalOffset != 0)
3297     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3298 
3299   unsigned Opc;
3300   if (IsTyped) {
3301     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3302                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3303   } else if (IsFormat) {
3304     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3305                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3306   } else {
3307     switch (MemSize) {
3308     case 1:
3309       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3310       break;
3311     case 2:
3312       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3313       break;
3314     default:
3315       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3316       break;
3317     }
3318   }
3319 
3320   if (!VIndex)
3321     VIndex = B.buildConstant(S32, 0).getReg(0);
3322 
3323   auto MIB = B.buildInstr(Opc)
3324     .addUse(VData)              // vdata
3325     .addUse(RSrc)               // rsrc
3326     .addUse(VIndex)             // vindex
3327     .addUse(VOffset)            // voffset
3328     .addUse(SOffset)            // soffset
3329     .addImm(ImmOffset);         // offset(imm)
3330 
3331   if (IsTyped)
3332     MIB.addImm(Format);
3333 
3334   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3335      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3336      .addMemOperand(MMO);
3337 
3338   MI.eraseFromParent();
3339   return true;
3340 }
3341 
3342 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3343                                              MachineRegisterInfo &MRI,
3344                                              MachineIRBuilder &B,
3345                                              bool IsFormat,
3346                                              bool IsTyped) const {
3347   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3348   MachineMemOperand *MMO = *MI.memoperands_begin();
3349   const int MemSize = MMO->getSize();
3350   const LLT S32 = LLT::scalar(32);
3351 
3352   Register Dst = MI.getOperand(0).getReg();
3353   Register RSrc = MI.getOperand(2).getReg();
3354 
3355   // The typed intrinsics add an immediate after the registers.
3356   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3357 
3358   // The struct intrinsic variants add one additional operand over raw.
3359   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3360   Register VIndex;
3361   int OpOffset = 0;
3362   if (HasVIndex) {
3363     VIndex = MI.getOperand(3).getReg();
3364     OpOffset = 1;
3365   }
3366 
3367   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3368   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3369 
3370   unsigned Format = 0;
3371   if (IsTyped) {
3372     Format = MI.getOperand(5 + OpOffset).getImm();
3373     ++OpOffset;
3374   }
3375 
3376   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3377   unsigned ImmOffset;
3378   unsigned TotalOffset;
3379 
3380   LLT Ty = MRI.getType(Dst);
3381   LLT EltTy = Ty.getScalarType();
3382   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3383   const bool Unpacked = ST.hasUnpackedD16VMem();
3384 
3385   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3386   if (TotalOffset != 0)
3387     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3388 
3389   unsigned Opc;
3390 
3391   if (IsTyped) {
3392     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3393                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3394   } else if (IsFormat) {
3395     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3396                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3397   } else {
3398     switch (MemSize) {
3399     case 1:
3400       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3401       break;
3402     case 2:
3403       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3404       break;
3405     default:
3406       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3407       break;
3408     }
3409   }
3410 
3411   Register LoadDstReg;
3412 
3413   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3414   LLT UnpackedTy = Ty.changeElementSize(32);
3415 
3416   if (IsExtLoad)
3417     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3418   else if (Unpacked && IsD16 && Ty.isVector())
3419     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3420   else
3421     LoadDstReg = Dst;
3422 
3423   if (!VIndex)
3424     VIndex = B.buildConstant(S32, 0).getReg(0);
3425 
3426   auto MIB = B.buildInstr(Opc)
3427     .addDef(LoadDstReg)         // vdata
3428     .addUse(RSrc)               // rsrc
3429     .addUse(VIndex)             // vindex
3430     .addUse(VOffset)            // voffset
3431     .addUse(SOffset)            // soffset
3432     .addImm(ImmOffset);         // offset(imm)
3433 
3434   if (IsTyped)
3435     MIB.addImm(Format);
3436 
3437   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3438      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3439      .addMemOperand(MMO);
3440 
3441   if (LoadDstReg != Dst) {
3442     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3443 
    // The result was widened for the extending load; truncate it back down.
3445     if (IsExtLoad)
3446       B.buildTrunc(Dst, LoadDstReg);
3447     else {
3448       // Repack to original 16-bit vector result
3449       // FIXME: G_TRUNC should work, but legalization currently fails
3450       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3451       SmallVector<Register, 4> Repack;
3452       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3453         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3454       B.buildMerge(Dst, Repack);
3455     }
3456   }
3457 
3458   MI.eraseFromParent();
3459   return true;
3460 }
3461 
3462 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3463                                                MachineIRBuilder &B,
3464                                                bool IsInc) const {
3465   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3466                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3467   B.buildInstr(Opc)
3468     .addDef(MI.getOperand(0).getReg())
3469     .addUse(MI.getOperand(2).getReg())
3470     .addUse(MI.getOperand(3).getReg())
3471     .cloneMemRefs(MI);
3472   MI.eraseFromParent();
3473   return true;
3474 }
3475 
3476 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3477   switch (IntrID) {
3478   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3479   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3480     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3481   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3482   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3483     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3484   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3486     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3487   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3488   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3489     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3490   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3491   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3492     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3493   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3494   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3495     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3496   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3497   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3498     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3499   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3500   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3501     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3502   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3503   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3504     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3505   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3506   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3507     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3508   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3509   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3510     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3511   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3512   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3513     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3514   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3515   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3516     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3517   default:
3518     llvm_unreachable("unhandled atomic opcode");
3519   }
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3523                                                MachineIRBuilder &B,
3524                                                Intrinsic::ID IID) const {
3525   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3526                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3527 
3528   Register Dst = MI.getOperand(0).getReg();
3529   Register VData = MI.getOperand(2).getReg();
3530 
3531   Register CmpVal;
3532   int OpOffset = 0;
3533 
3534   if (IsCmpSwap) {
3535     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3536     ++OpOffset;
3537   }
3538 
3539   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3540   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3541 
3542   // The struct intrinsic variants add one additional operand over raw.
3543   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3544   Register VIndex;
3545   if (HasVIndex) {
3546     VIndex = MI.getOperand(4 + OpOffset).getReg();
3547     ++OpOffset;
3548   }
3549 
3550   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3551   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3552   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3553 
3554   MachineMemOperand *MMO = *MI.memoperands_begin();
3555 
3556   unsigned ImmOffset;
3557   unsigned TotalOffset;
3558   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3559   if (TotalOffset != 0)
3560     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3561 
3562   if (!VIndex)
3563     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3564 
3565   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3566     .addDef(Dst)
3567     .addUse(VData); // vdata
3568 
3569   if (IsCmpSwap)
3570     MIB.addReg(CmpVal);
3571 
3572   MIB.addUse(RSrc)               // rsrc
3573      .addUse(VIndex)             // vindex
3574      .addUse(VOffset)            // voffset
3575      .addUse(SOffset)            // soffset
3576      .addImm(ImmOffset)          // offset(imm)
3577      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3578      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3579      .addMemOperand(MMO);
3580 
3581   MI.eraseFromParent();
3582   return true;
3583 }
3584 
/// Pack a set of s16 typed address operands of \p MI into dword-sized
/// <2 x s16> values, appending the packed registers to \p PackedAddrs.
3587 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3588                                         SmallVectorImpl<Register> &PackedAddrs,
3589                                         int AddrIdx, int DimIdx, int EndIdx,
3590                                         int NumGradients) {
3591   const LLT S16 = LLT::scalar(16);
3592   const LLT V2S16 = LLT::vector(2, 16);
3593 
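  // Operands before DimIdx are already dword-sized and only need a bitcast to
  // v2s16; the s16 coordinates that follow are packed two per dword, e.g. a
  // three-coordinate (x, y, z) address becomes {x|y, z|undef}.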
3594   for (int I = AddrIdx; I < EndIdx; ++I) {
3595     MachineOperand &SrcOp = MI.getOperand(I);
3596     if (!SrcOp.isReg())
3597       continue; // _L to _LZ may have eliminated this.
3598 
3599     Register AddrReg = SrcOp.getReg();
3600 
3601     if (I < DimIdx) {
3602       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3603       PackedAddrs.push_back(AddrReg);
3604     } else {
3605       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3606       // derivatives dx/dh and dx/dv are packed with undef.
3607       if (((I + 1) >= EndIdx) ||
3608           ((NumGradients / 2) % 2 == 1 &&
3609            (I == DimIdx + (NumGradients / 2) - 1 ||
3610             I == DimIdx + NumGradients - 1)) ||
3611           // Check for _L to _LZ optimization
3612           !MI.getOperand(I + 1).isReg()) {
3613         PackedAddrs.push_back(
3614             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3615                 .getReg(0));
3616       } else {
3617         PackedAddrs.push_back(
3618             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3619                 .getReg(0));
3620         ++I;
3621       }
3622     }
3623   }
3624 }
3625 
3626 /// Convert from separate vaddr components to a single vector address register,
3627 /// and replace the remaining operands with $noreg.
3628 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3629                                      int DimIdx, int NumVAddrs) {
3630   const LLT S32 = LLT::scalar(32);
3631 
3632   SmallVector<Register, 8> AddrRegs;
3633   for (int I = 0; I != NumVAddrs; ++I) {
3634     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3635     if (SrcOp.isReg()) {
3636       AddrRegs.push_back(SrcOp.getReg());
3637       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3638     }
3639   }
3640 
3641   int NumAddrRegs = AddrRegs.size();
3642   if (NumAddrRegs != 1) {
3643     // Round up to 8 elements for v5-v7
3644     // FIXME: Missing intermediate sized register classes and instructions.
3645     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3646       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3647       auto Undef = B.buildUndef(S32);
3648       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3649       NumAddrRegs = RoundedNumRegs;
3650     }
3651 
3652     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3653     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3654   }
3655 
3656   for (int I = 1; I != NumVAddrs; ++I) {
3657     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3658     if (SrcOp.isReg())
3659       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3660   }
3661 }
3662 
3663 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3664 ///
3665 /// Depending on the subtarget, load/store with 16-bit element data need to be
3666 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3667 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3668 /// registers.
3669 ///
3670 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3675 /// now unnecessary arguments with $noreg.
3676 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3677     MachineInstr &MI, MachineIRBuilder &B,
3678     GISelChangeObserver &Observer,
3679     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3680 
3681   const int NumDefs = MI.getNumExplicitDefs();
3682   bool IsTFE = NumDefs == 2;
3683   // We are only processing the operands of d16 image operations on subtargets
3684   // that use the unpacked register layout, or need to repack the TFE result.
3685 
3686   // TODO: Do we need to guard against already legalized intrinsics?
3687   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3688     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3689 
3690   MachineRegisterInfo *MRI = B.getMRI();
3691   const LLT S32 = LLT::scalar(32);
3692   const LLT S16 = LLT::scalar(16);
3693   const LLT V2S16 = LLT::vector(2, 16);
3694 
3695   // Index of first address argument
3696   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3697 
3698   int NumVAddrs, NumGradients;
3699   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3700   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3701     getDMaskIdx(BaseOpcode, NumDefs);
3702   unsigned DMask = 0;
3703 
  // Check for 16-bit addresses and gradients; they are packed below.
3705   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3706   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3707   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3708   const bool IsG16 = GradTy == S16;
3709   const bool IsA16 = AddrTy == S16;
3710 
3711   int DMaskLanes = 0;
3712   if (!BaseOpcode->Atomic) {
3713     DMask = MI.getOperand(DMaskIdx).getImm();
3714     if (BaseOpcode->Gather4) {
3715       DMaskLanes = 4;
3716     } else if (DMask != 0) {
3717       DMaskLanes = countPopulation(DMask);
3718     } else if (!IsTFE && !BaseOpcode->Store) {
3719       // If dmask is 0, this is a no-op load. This can be eliminated.
3720       B.buildUndef(MI.getOperand(0));
3721       MI.eraseFromParent();
3722       return true;
3723     }
3724   }
3725 
3726   Observer.changingInstr(MI);
3727   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3728 
3729   unsigned NewOpcode = NumDefs == 0 ?
3730     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3731 
3732   // Track that we legalized this
3733   MI.setDesc(B.getTII().get(NewOpcode));
3734 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3737   if (IsTFE && DMask == 0) {
3738     DMask = 0x1;
3739     DMaskLanes = 1;
3740     MI.getOperand(DMaskIdx).setImm(DMask);
3741   }
3742 
3743   if (BaseOpcode->Atomic) {
3744     Register VData0 = MI.getOperand(2).getReg();
3745     LLT Ty = MRI->getType(VData0);
3746 
3747     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3748     if (Ty.isVector())
3749       return false;
3750 
3751     if (BaseOpcode->AtomicX2) {
3752       Register VData1 = MI.getOperand(3).getReg();
3753       // The two values are packed in one register.
3754       LLT PackedTy = LLT::vector(2, Ty);
3755       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3756       MI.getOperand(2).setReg(Concat.getReg(0));
3757       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3758     }
3759   }
3760 
3761   int CorrectedNumVAddrs = NumVAddrs;
3762 
  // Optimize _L to _LZ when the LOD argument is zero.
3764   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3765         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3766     const ConstantFP *ConstantLod;
3767     const int LodIdx = AddrIdx + NumVAddrs - 1;
3768 
3769     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3770       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3771         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3772         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3773           LZMappingInfo->LZ, ImageDimIntr->Dim);
3774 
3775         // The starting indexes should remain in the same place.
3776         --NumVAddrs;
3777         --CorrectedNumVAddrs;
3778 
3779         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3780           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3781         MI.RemoveOperand(LodIdx);
3782       }
3783     }
3784   }
3785 
3786   // Optimize _mip away, when 'lod' is zero
3787   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3788     int64_t ConstantLod;
3789     const int LodIdx = AddrIdx + NumVAddrs - 1;
3790 
3791     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3792       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as the _L to _LZ handling does above.
3795         MI.getOperand(LodIdx).ChangeToImmediate(0);
3796         --CorrectedNumVAddrs;
3797       }
3798     }
3799   }
3800 
3801   // Rewrite the addressing register layout before doing anything else.
3802   if (IsA16 || IsG16) {
3803     if (IsA16) {
3804       // Target must support the feature and gradients need to be 16 bit too
3805       if (!ST.hasA16() || !IsG16)
3806         return false;
3807     } else if (!ST.hasG16())
3808       return false;
3809 
3810     if (NumVAddrs > 1) {
3811       SmallVector<Register, 4> PackedRegs;
3812       // Don't compress addresses for G16
3813       const int PackEndIdx =
3814           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3815       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3816                                   PackEndIdx, NumGradients);
3817 
3818       if (!IsA16) {
3819         // Add uncompressed address
3820         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3821           int AddrReg = MI.getOperand(I).getReg();
3822           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3823           PackedRegs.push_back(AddrReg);
3824         }
3825       }
3826 
3827       // See also below in the non-a16 branch
3828       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3829 
3830       if (!UseNSA && PackedRegs.size() > 1) {
3831         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3832         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3833         PackedRegs[0] = Concat.getReg(0);
3834         PackedRegs.resize(1);
3835       }
3836 
3837       const int NumPacked = PackedRegs.size();
3838       for (int I = 0; I != NumVAddrs; ++I) {
3839         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3840         if (!SrcOp.isReg()) {
3841           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3842           continue;
3843         }
3844 
3845         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3846 
3847         if (I < NumPacked)
3848           SrcOp.setReg(PackedRegs[I]);
3849         else
3850           SrcOp.setReg(AMDGPU::NoRegister);
3851       }
3852     }
3853   } else {
3854     // If the register allocator cannot place the address registers contiguously
3855     // without introducing moves, then using the non-sequential address encoding
3856     // is always preferable, since it saves VALU instructions and is usually a
3857     // wash in terms of code size or even better.
3858     //
3859     // However, we currently have no way of hinting to the register allocator
3860     // that MIMG addresses should be placed contiguously when it is possible to
3861     // do so, so force non-NSA for the common 2-address case as a heuristic.
3862     //
3863     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3864     // allocation when possible.
3865     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3866 
3867     if (!UseNSA && NumVAddrs > 1)
3868       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3869   }
3870 
3871   int Flags = 0;
3872   if (IsA16)
3873     Flags |= 1;
3874   if (IsG16)
3875     Flags |= 2;
3876   MI.addOperand(MachineOperand::CreateImm(Flags));
3877 
3878   if (BaseOpcode->Store) { // No TFE for stores?
3879     // TODO: Handle dmask trim
3880     Register VData = MI.getOperand(1).getReg();
3881     LLT Ty = MRI->getType(VData);
3882     if (!Ty.isVector() || Ty.getElementType() != S16)
3883       return true;
3884 
3885     Register RepackedReg = handleD16VData(B, *MRI, VData);
3886     if (RepackedReg != VData) {
3887       MI.getOperand(1).setReg(RepackedReg);
3888     }
3889 
3890     return true;
3891   }
3892 
3893   Register DstReg = MI.getOperand(0).getReg();
3894   LLT Ty = MRI->getType(DstReg);
3895   const LLT EltTy = Ty.getScalarType();
3896   const bool IsD16 = Ty.getScalarType() == S16;
3897   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3898 
3899   // Confirm that the return type is large enough for the dmask specified
3900   if (NumElts < DMaskLanes)
3901     return false;
3902 
3903   if (NumElts > 4 || DMaskLanes > 4)
3904     return false;
3905 
3906   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3907   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3908 
3909   // The raw dword aligned data component of the load. The only legal cases
3910   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3912   LLT RoundedTy;
3913 
  // S32 vector to cover all data, plus the TFE result element.
3915   LLT TFETy;
3916 
3917   // Register type to use for each loaded component. Will be S32 or V2S16.
3918   LLT RegTy;
3919 
3920   if (IsD16 && ST.hasUnpackedD16VMem()) {
3921     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3922     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3923     RegTy = S32;
3924   } else {
3925     unsigned EltSize = EltTy.getSizeInBits();
3926     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3927     unsigned RoundedSize = 32 * RoundedElts;
3928     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3929     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3930     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3931   }
3932 
3933   // The return type does not need adjustment.
3934   // TODO: Should we change s16 case to s32 or <2 x s16>?
3935   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3936     return true;
3937 
3938   Register Dst1Reg;
3939 
3940   // Insert after the instruction.
3941   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3942 
3943   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3944   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3945   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3946   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3947 
3948   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3949 
3950   MI.getOperand(0).setReg(NewResultReg);
3951 
3952   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
3954   // register, with one additional dword beyond the loaded data. Rewrite the
3955   // return type to use a single register result.
3956 
3957   if (IsTFE) {
3958     Dst1Reg = MI.getOperand(1).getReg();
3959     if (MRI->getType(Dst1Reg) != S32)
3960       return false;
3961 
3962     // TODO: Make sure the TFE operand bit is set.
3963     MI.RemoveOperand(1);
3964 
3965     // Handle the easy case that requires no repack instructions.
3966     if (Ty == S32) {
3967       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3968       return true;
3969     }
3970   }
3971 
3972   // Now figure out how to copy the new result register back into the old
3973   // result.
3974   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3975 
3976   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3977 
3978   if (ResultNumRegs == 1) {
3979     assert(!IsTFE);
3980     ResultRegs[0] = NewResultReg;
3981   } else {
3982     // We have to repack into a new vector of some kind.
3983     for (int I = 0; I != NumDataRegs; ++I)
3984       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3985     B.buildUnmerge(ResultRegs, NewResultReg);
3986 
3987     // Drop the final TFE element to get the data part. The TFE result is
3988     // directly written to the right place already.
3989     if (IsTFE)
3990       ResultRegs.resize(NumDataRegs);
3991   }
3992 
3993   // For an s16 scalar result, we form an s32 result with a truncate regardless
3994   // of packed vs. unpacked.
3995   if (IsD16 && !Ty.isVector()) {
3996     B.buildTrunc(DstReg, ResultRegs[0]);
3997     return true;
3998   }
3999 
4000   // Avoid a build/concat_vector of 1 entry.
4001   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4002     B.buildBitcast(DstReg, ResultRegs[0]);
4003     return true;
4004   }
4005 
4006   assert(Ty.isVector());
4007 
4008   if (IsD16) {
4009     // For packed D16 results with TFE enabled, all the data components are
4010     // S32. Cast back to the expected type.
4011     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
4014     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4015       for (Register &Reg : ResultRegs)
4016         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4017     } else if (ST.hasUnpackedD16VMem()) {
4018       for (Register &Reg : ResultRegs)
4019         Reg = B.buildTrunc(S16, Reg).getReg(0);
4020     }
4021   }
4022 
4023   auto padWithUndef = [&](LLT Ty, int NumElts) {
4024     if (NumElts == 0)
4025       return;
4026     Register Undef = B.buildUndef(Ty).getReg(0);
4027     for (int I = 0; I != NumElts; ++I)
4028       ResultRegs.push_back(Undef);
4029   };
4030 
4031   // Pad out any elements eliminated due to the dmask.
4032   LLT ResTy = MRI->getType(ResultRegs[0]);
4033   if (!ResTy.isVector()) {
4034     padWithUndef(ResTy, NumElts - ResultRegs.size());
4035     B.buildBuildVector(DstReg, ResultRegs);
4036     return true;
4037   }
4038 
4039   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4040   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4041 
4042   // Deal with the one annoying legal case.
4043   const LLT V3S16 = LLT::vector(3, 16);
4044   if (Ty == V3S16) {
4045     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4046     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4047     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4048     return true;
4049   }
4050 
4051   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4052   B.buildConcatVectors(DstReg, ResultRegs);
4053   return true;
4054 }
4055 
4056 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4057   MachineInstr &MI, MachineIRBuilder &B,
4058   GISelChangeObserver &Observer) const {
4059   Register Dst = MI.getOperand(0).getReg();
4060   LLT Ty = B.getMRI()->getType(Dst);
4061   unsigned Size = Ty.getSizeInBits();
4062   MachineFunction &MF = B.getMF();
4063 
4064   Observer.changingInstr(MI);
4065 
4066   // FIXME: We don't really need this intermediate instruction. The intrinsic
4067   // should be fixed to have a memory operand. Since it's readnone, we're not
4068   // allowed to add one.
4069   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4070   MI.RemoveOperand(1); // Remove intrinsic ID
4071 
4072   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4073   // TODO: Should this use datalayout alignment?
4074   const unsigned MemSize = (Size + 7) / 8;
4075   const Align MemAlign(4);
4076   MachineMemOperand *MMO = MF.getMachineMemOperand(
4077       MachinePointerInfo(),
4078       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4079           MachineMemOperand::MOInvariant,
4080       MemSize, MemAlign);
4081   MI.addMemOperand(MF, MMO);
4082 
4083   // There are no 96-bit result scalar loads, but widening to 128-bit should
4084   // always be legal. We may need to restore this to a 96-bit result if it turns
4085   // out this needs to be converted to a vector load during RegBankSelect.
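  // For example, a <3 x s32> (96-bit) result is widened to <4 x s32> here.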
4086   if (!isPowerOf2_32(Size)) {
4087     LegalizerHelper Helper(MF, *this, Observer, B);
4088 
4089     if (Ty.isVector())
4090       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4091     else
4092       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4093   }
4094 
4095   Observer.changedInstr(MI);
4096   return true;
4097 }
4098 
4099 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4100                                                 MachineRegisterInfo &MRI,
4101                                                 MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction instead.
4103   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4104       !ST.isTrapHandlerEnabled()) {
4105     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4106   } else {
4107     // Pass queue pointer to trap handler as input, and insert trap instruction
4108     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4109     const ArgDescriptor *Arg =
4110         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4111     if (!Arg)
4112       return false;
4113     MachineRegisterInfo &MRI = *B.getMRI();
4114     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4115     Register LiveIn = getLiveInRegister(
4116         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4117         /*InsertLiveInCopy=*/false);
4118     if (!loadInputValue(LiveIn, B, Arg))
4119       return false;
4120     B.buildCopy(SGPR01, LiveIn);
4121     B.buildInstr(AMDGPU::S_TRAP)
4122         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4123         .addReg(SGPR01, RegState::Implicit);
4124   }
4125 
4126   MI.eraseFromParent();
4127   return true;
4128 }
4129 
4130 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4131     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4134   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4135       !ST.isTrapHandlerEnabled()) {
4136     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4137                                      "debugtrap handler not supported",
4138                                      MI.getDebugLoc(), DS_Warning);
4139     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4140     Ctx.diagnose(NoTrap);
4141   } else {
4142     // Insert debug-trap instruction
4143     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4144   }
4145 
4146   MI.eraseFromParent();
4147   return true;
4148 }
4149 
4150 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4151                                             MachineInstr &MI) const {
4152   MachineIRBuilder &B = Helper.MIRBuilder;
4153   MachineRegisterInfo &MRI = *B.getMRI();
4154 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4156   auto IntrID = MI.getIntrinsicID();
4157   switch (IntrID) {
4158   case Intrinsic::amdgcn_if:
4159   case Intrinsic::amdgcn_else: {
4160     MachineInstr *Br = nullptr;
4161     MachineBasicBlock *UncondBrTarget = nullptr;
4162     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4163       const SIRegisterInfo *TRI
4164         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4165 
4166       Register Def = MI.getOperand(1).getReg();
4167       Register Use = MI.getOperand(3).getReg();
4168 
4169       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4170       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4171       if (IntrID == Intrinsic::amdgcn_if) {
4172         B.buildInstr(AMDGPU::SI_IF)
4173           .addDef(Def)
4174           .addUse(Use)
4175           .addMBB(UncondBrTarget);
4176       } else {
4177         B.buildInstr(AMDGPU::SI_ELSE)
4178           .addDef(Def)
4179           .addUse(Use)
4180           .addMBB(UncondBrTarget)
4181           .addImm(0);
4182       }
4183 
4184       if (Br) {
4185         Br->getOperand(0).setMBB(CondBrTarget);
4186       } else {
4187         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4188         // since we're swapping branch targets it needs to be reinserted.
4189         // FIXME: IRTranslator should probably not do this
4190         B.buildBr(*CondBrTarget);
4191       }
4192 
4193       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4194       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4195       MI.eraseFromParent();
4196       BrCond->eraseFromParent();
4197       return true;
4198     }
4199 
4200     return false;
4201   }
4202   case Intrinsic::amdgcn_loop: {
4203     MachineInstr *Br = nullptr;
4204     MachineBasicBlock *UncondBrTarget = nullptr;
4205     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4206       const SIRegisterInfo *TRI
4207         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4208 
4209       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4210       Register Reg = MI.getOperand(2).getReg();
4211 
4212       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4213       B.buildInstr(AMDGPU::SI_LOOP)
4214         .addUse(Reg)
4215         .addMBB(UncondBrTarget);
4216 
4217       if (Br)
4218         Br->getOperand(0).setMBB(CondBrTarget);
4219       else
4220         B.buildBr(*CondBrTarget);
4221 
4222       MI.eraseFromParent();
4223       BrCond->eraseFromParent();
4224       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4225       return true;
4226     }
4227 
4228     return false;
4229   }
4230   case Intrinsic::amdgcn_kernarg_segment_ptr:
4231     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4232       // This only makes sense to call in a kernel, so just lower to null.
4233       B.buildConstant(MI.getOperand(0).getReg(), 0);
4234       MI.eraseFromParent();
4235       return true;
4236     }
4237 
4238     return legalizePreloadedArgIntrin(
4239       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4240   case Intrinsic::amdgcn_implicitarg_ptr:
4241     return legalizeImplicitArgPtr(MI, MRI, B);
4242   case Intrinsic::amdgcn_workitem_id_x:
4243     return legalizePreloadedArgIntrin(MI, MRI, B,
4244                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4245   case Intrinsic::amdgcn_workitem_id_y:
4246     return legalizePreloadedArgIntrin(MI, MRI, B,
4247                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4248   case Intrinsic::amdgcn_workitem_id_z:
4249     return legalizePreloadedArgIntrin(MI, MRI, B,
4250                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4251   case Intrinsic::amdgcn_workgroup_id_x:
4252     return legalizePreloadedArgIntrin(MI, MRI, B,
4253                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4254   case Intrinsic::amdgcn_workgroup_id_y:
4255     return legalizePreloadedArgIntrin(MI, MRI, B,
4256                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4257   case Intrinsic::amdgcn_workgroup_id_z:
4258     return legalizePreloadedArgIntrin(MI, MRI, B,
4259                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4260   case Intrinsic::amdgcn_dispatch_ptr:
4261     return legalizePreloadedArgIntrin(MI, MRI, B,
4262                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4263   case Intrinsic::amdgcn_queue_ptr:
4264     return legalizePreloadedArgIntrin(MI, MRI, B,
4265                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4266   case Intrinsic::amdgcn_implicit_buffer_ptr:
4267     return legalizePreloadedArgIntrin(
4268       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4269   case Intrinsic::amdgcn_dispatch_id:
4270     return legalizePreloadedArgIntrin(MI, MRI, B,
4271                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4272   case Intrinsic::amdgcn_fdiv_fast:
4273     return legalizeFDIVFastIntrin(MI, MRI, B);
4274   case Intrinsic::amdgcn_is_shared:
4275     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4276   case Intrinsic::amdgcn_is_private:
4277     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4278   case Intrinsic::amdgcn_wavefrontsize: {
4279     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4280     MI.eraseFromParent();
4281     return true;
4282   }
4283   case Intrinsic::amdgcn_s_buffer_load:
4284     return legalizeSBufferLoad(MI, B, Helper.Observer);
4285   case Intrinsic::amdgcn_raw_buffer_store:
4286   case Intrinsic::amdgcn_struct_buffer_store:
4287     return legalizeBufferStore(MI, MRI, B, false, false);
4288   case Intrinsic::amdgcn_raw_buffer_store_format:
4289   case Intrinsic::amdgcn_struct_buffer_store_format:
4290     return legalizeBufferStore(MI, MRI, B, false, true);
4291   case Intrinsic::amdgcn_raw_tbuffer_store:
4292   case Intrinsic::amdgcn_struct_tbuffer_store:
4293     return legalizeBufferStore(MI, MRI, B, true, true);
4294   case Intrinsic::amdgcn_raw_buffer_load:
4295   case Intrinsic::amdgcn_struct_buffer_load:
4296     return legalizeBufferLoad(MI, MRI, B, false, false);
4297   case Intrinsic::amdgcn_raw_buffer_load_format:
4298   case Intrinsic::amdgcn_struct_buffer_load_format:
4299     return legalizeBufferLoad(MI, MRI, B, true, false);
4300   case Intrinsic::amdgcn_raw_tbuffer_load:
4301   case Intrinsic::amdgcn_struct_tbuffer_load:
4302     return legalizeBufferLoad(MI, MRI, B, true, true);
4303   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4304   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4305   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4306   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4307   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4308   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4309   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4310   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4311   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4312   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4313   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4314   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4315   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4316   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4317   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4318   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4319   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4320   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4321   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4322   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4323   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4324   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4325   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4326   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4327   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4328   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4329     return legalizeBufferAtomic(MI, B, IntrID);
4330   case Intrinsic::amdgcn_atomic_inc:
4331     return legalizeAtomicIncDec(MI, B, true);
4332   case Intrinsic::amdgcn_atomic_dec:
4333     return legalizeAtomicIncDec(MI, B, false);
4334   case Intrinsic::trap:
4335     return legalizeTrapIntrinsic(MI, MRI, B);
4336   case Intrinsic::debugtrap:
4337     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4338   default: {
4339     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4340             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4341       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4342     return true;
4343   }
4344   }
4345 
4346   return true;
4347 }
4348