1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
25 
26 #define DEBUG_TYPE "amdgpu-legalinfo"
27 
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
32 
33 
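// Return true if the type is no bigger than MaxSize bits and its scalar
// element size is a multiple of 32.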
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35                                       unsigned MaxSize = 512) {
36   return [=](const LegalityQuery &Query) {
37     const LLT Ty = Query.Types[TypeIdx];
38     const LLT EltTy = Ty.getScalarType();
39     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
40   };
41 }
42 
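// Return true for vectors with an odd number of sub-32-bit elements, e.g.
// v3s16, which are typically widened by one element (see oneMoreElement).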
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     return Ty.isVector() &&
47            Ty.getNumElements() % 2 != 0 &&
48            Ty.getElementType().getSizeInBits() < 32;
49   };
50 }
51 
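// Mutation that pads the vector type with one extra element, e.g. v3s16 to
// v4s16.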
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53   return [=](const LegalityQuery &Query) {
54     const LLT Ty = Query.Types[TypeIdx];
55     const LLT EltTy = Ty.getElementType();
56     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
57   };
58 }
59 
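// Mutation that shrinks the element count so each resulting piece is at most
// 64 bits wide, e.g. both v4s32 and v3s32 become v2s32.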
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     unsigned Size = Ty.getSizeInBits();
65     unsigned Pieces = (Size + 63) / 64;
66     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
68   };
69 }
70 
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72   return [=](const LegalityQuery &Query) {
73     const LLT QueryTy = Query.Types[TypeIdx];
74     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
75   };
76 }
77 
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT QueryTy = Query.Types[TypeIdx];
81     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
82   };
83 }
84 
// Any combination of 32- or 64-bit elements up to 512 bits, and multiples of
86 // v2s16.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     if (Ty.isVector()) {
91       const int EltSize = Ty.getElementType().getSizeInBits();
92       return EltSize == 32 || EltSize == 64 ||
93             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94              EltSize == 128 || EltSize == 256;
95     }
96 
97     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
98   };
99 }
100 
101 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
102   return [=](const LegalityQuery &Query) {
103     return Query.Types[TypeIdx].getElementType() == Type;
104   };
105 }
106 
107 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
108                                          const GCNTargetMachine &TM)
109   :  ST(ST_) {
110   using namespace TargetOpcode;
111 
112   auto GetAddrSpacePtr = [&TM](unsigned AS) {
113     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
114   };
115 
116   const LLT S1 = LLT::scalar(1);
117   const LLT S8 = LLT::scalar(8);
118   const LLT S16 = LLT::scalar(16);
119   const LLT S32 = LLT::scalar(32);
120   const LLT S64 = LLT::scalar(64);
121   const LLT S128 = LLT::scalar(128);
122   const LLT S256 = LLT::scalar(256);
123   const LLT S512 = LLT::scalar(512);
124 
125   const LLT V2S16 = LLT::vector(2, 16);
126   const LLT V4S16 = LLT::vector(4, 16);
127 
128   const LLT V2S32 = LLT::vector(2, 32);
129   const LLT V3S32 = LLT::vector(3, 32);
130   const LLT V4S32 = LLT::vector(4, 32);
131   const LLT V5S32 = LLT::vector(5, 32);
132   const LLT V6S32 = LLT::vector(6, 32);
133   const LLT V7S32 = LLT::vector(7, 32);
134   const LLT V8S32 = LLT::vector(8, 32);
135   const LLT V9S32 = LLT::vector(9, 32);
136   const LLT V10S32 = LLT::vector(10, 32);
137   const LLT V11S32 = LLT::vector(11, 32);
138   const LLT V12S32 = LLT::vector(12, 32);
139   const LLT V13S32 = LLT::vector(13, 32);
140   const LLT V14S32 = LLT::vector(14, 32);
141   const LLT V15S32 = LLT::vector(15, 32);
142   const LLT V16S32 = LLT::vector(16, 32);
143 
144   const LLT V2S64 = LLT::vector(2, 64);
145   const LLT V3S64 = LLT::vector(3, 64);
146   const LLT V4S64 = LLT::vector(4, 64);
147   const LLT V5S64 = LLT::vector(5, 64);
148   const LLT V6S64 = LLT::vector(6, 64);
149   const LLT V7S64 = LLT::vector(7, 64);
150   const LLT V8S64 = LLT::vector(8, 64);
151 
152   std::initializer_list<LLT> AllS32Vectors =
153     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
154      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
155   std::initializer_list<LLT> AllS64Vectors =
156     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
157 
158   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
159   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
160   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
161   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
162   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
163   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
164   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
165 
166   const LLT CodePtr = FlatPtr;
167 
168   const std::initializer_list<LLT> AddrSpaces64 = {
169     GlobalPtr, ConstantPtr, FlatPtr
170   };
171 
172   const std::initializer_list<LLT> AddrSpaces32 = {
173     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
174   };
175 
176   const std::initializer_list<LLT> FPTypesBase = {
177     S32, S64
178   };
179 
180   const std::initializer_list<LLT> FPTypes16 = {
181     S32, S64, S16
182   };
183 
184   const std::initializer_list<LLT> FPTypesPK16 = {
185     S32, S64, S16, V2S16
186   };
187 
188   setAction({G_BRCOND, S1}, Legal);
189 
190   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
191   // elements for v3s16
192   getActionDefinitionsBuilder(G_PHI)
193     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
194     .legalFor(AllS32Vectors)
195     .legalFor(AllS64Vectors)
196     .legalFor(AddrSpaces64)
197     .legalFor(AddrSpaces32)
198     .clampScalar(0, S32, S256)
199     .widenScalarToNextPow2(0, 32)
200     .clampMaxNumElements(0, S32, 16)
201     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
202     .legalIf(isPointer(0));
203 
204   if (ST.has16BitInsts()) {
205     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
206       .legalFor({S32, S16})
207       .clampScalar(0, S16, S32)
208       .scalarize(0);
209   } else {
210     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
211       .legalFor({S32})
212       .clampScalar(0, S32, S32)
213       .scalarize(0);
214   }
215 
216   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
217     .legalFor({S32})
218     .clampScalar(0, S32, S32)
219     .scalarize(0);
220 
221   // Report legal for any types we can handle anywhere. For the cases only legal
222   // on the SALU, RegBankSelect will be able to re-legalize.
223   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
224     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
225     .clampScalar(0, S32, S64)
226     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
227     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
228     .widenScalarToNextPow2(0)
229     .scalarize(0);
230 
231   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
232                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
233     .legalFor({{S32, S1}})
234     .clampScalar(0, S32, S32);
235 
236   getActionDefinitionsBuilder(G_BITCAST)
237     .legalForCartesianProduct({S32, V2S16})
238     .legalForCartesianProduct({S64, V2S32, V4S16})
239     .legalForCartesianProduct({V2S64, V4S32})
240     // Don't worry about the size constraint.
241     .legalIf(all(isPointer(0), isPointer(1)));
242 
243   if (ST.has16BitInsts()) {
244     getActionDefinitionsBuilder(G_FCONSTANT)
245       .legalFor({S32, S64, S16})
246       .clampScalar(0, S16, S64);
247   } else {
248     getActionDefinitionsBuilder(G_FCONSTANT)
249       .legalFor({S32, S64})
250       .clampScalar(0, S32, S64);
251   }
252 
253   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
254     .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
255                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
256     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
257     .clampScalarOrElt(0, S32, S512)
258     .legalIf(isMultiple32(0))
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16);
261 
262 
263   // FIXME: i1 operands to intrinsics should always be legal, but other i1
264   // values may not be legal.  We need to figure out how to distinguish
265   // between these two scenarios.
266   getActionDefinitionsBuilder(G_CONSTANT)
267     .legalFor({S1, S32, S64, GlobalPtr,
268                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
269     .clampScalar(0, S32, S64)
270     .widenScalarToNextPow2(0)
271     .legalIf(isPointer(0));
272 
273   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
274 
275   auto &FPOpActions = getActionDefinitionsBuilder(
276     { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
277     .legalFor({S32, S64});
278 
279   if (ST.has16BitInsts()) {
280     if (ST.hasVOP3PInsts())
281       FPOpActions.legalFor({S16, V2S16});
282     else
283       FPOpActions.legalFor({S16});
284   }
285 
286   auto &MinNumMaxNum = getActionDefinitionsBuilder({
287       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
288 
289   if (ST.hasVOP3PInsts()) {
290     MinNumMaxNum.customFor(FPTypesPK16)
291       .clampMaxNumElements(0, S16, 2)
292       .clampScalar(0, S16, S64)
293       .scalarize(0);
294   } else if (ST.has16BitInsts()) {
295     MinNumMaxNum.customFor(FPTypes16)
296       .clampScalar(0, S16, S64)
297       .scalarize(0);
298   } else {
299     MinNumMaxNum.customFor(FPTypesBase)
300       .clampScalar(0, S32, S64)
301       .scalarize(0);
302   }
303 
304   // TODO: Implement
305   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
306 
307   if (ST.hasVOP3PInsts())
308     FPOpActions.clampMaxNumElements(0, S16, 2);
309   FPOpActions
310     .scalarize(0)
311     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
312 
313   if (ST.has16BitInsts()) {
314     getActionDefinitionsBuilder(G_FSQRT)
315       .legalFor({S32, S64, S16})
316       .scalarize(0)
317       .clampScalar(0, S16, S64);
318   } else {
319     getActionDefinitionsBuilder(G_FSQRT)
320       .legalFor({S32, S64})
321       .scalarize(0)
322       .clampScalar(0, S32, S64);
323   }
324 
325   getActionDefinitionsBuilder(G_FPTRUNC)
326     .legalFor({{S32, S64}, {S16, S32}})
327     .scalarize(0);
328 
329   getActionDefinitionsBuilder(G_FPEXT)
330     .legalFor({{S64, S32}, {S32, S16}})
331     .lowerFor({{S64, S16}}) // FIXME: Implement
332     .scalarize(0);
333 
334   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
335   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
336 
337   getActionDefinitionsBuilder(G_FSUB)
338       // Use actual fsub instruction
339       .legalFor({S32})
340       // Must use fadd + fneg
341       .lowerFor({S64, S16, V2S16})
342       .scalarize(0)
343       .clampScalar(0, S32, S64);
344 
345   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
346     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
347                {S32, S1}, {S64, S1}, {S16, S1},
348                // FIXME: Hack
349                {S64, LLT::scalar(33)},
350                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
351     .scalarize(0);
352 
353   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
354     .legalFor({{S32, S32}, {S64, S32}})
355     .lowerFor({{S32, S64}})
356     .customFor({{S64, S64}})
357     .scalarize(0);
358 
359   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
360     .legalFor({{S32, S32}, {S32, S64}})
361     .scalarize(0);
362 
363   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
364     .legalFor({S32, S64})
365     .scalarize(0);
366 
367   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
368     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
369       .legalFor({S32, S64})
370       .clampScalar(0, S32, S64)
371       .scalarize(0);
372   } else {
373     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
374       .legalFor({S32})
375       .customFor({S64})
376       .clampScalar(0, S32, S64)
377       .scalarize(0);
378   }
379 
380   getActionDefinitionsBuilder(G_GEP)
381     .legalForCartesianProduct(AddrSpaces64, {S64})
382     .legalForCartesianProduct(AddrSpaces32, {S32})
383     .scalarize(0);
384 
385   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
386 
387   auto &CmpBuilder =
388     getActionDefinitionsBuilder(G_ICMP)
389     .legalForCartesianProduct(
390       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
391     .legalFor({{S1, S32}, {S1, S64}});
392   if (ST.has16BitInsts()) {
393     CmpBuilder.legalFor({{S1, S16}});
394   }
395 
396   CmpBuilder
397     .widenScalarToNextPow2(1)
398     .clampScalar(1, S32, S64)
399     .scalarize(0)
400     .legalIf(all(typeIs(0, S1), isPointer(1)));
401 
402   getActionDefinitionsBuilder(G_FCMP)
403     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
404     .widenScalarToNextPow2(1)
405     .clampScalar(1, S32, S64)
406     .scalarize(0);
407 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
409   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
410                                G_FLOG, G_FLOG2, G_FLOG10})
411     .legalFor({S32})
412     .scalarize(0);
413 
414   // The 64-bit versions produce 32-bit results, but only on the SALU.
415   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
416                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
417                                G_CTPOP})
418     .legalFor({{S32, S32}, {S32, S64}})
419     .clampScalar(0, S32, S32)
420     .clampScalar(1, S32, S64)
421     .scalarize(0)
422     .widenScalarToNextPow2(0, 32)
423     .widenScalarToNextPow2(1, 32);
424 
425   // TODO: Expand for > s32
426   getActionDefinitionsBuilder(G_BSWAP)
427     .legalFor({S32})
428     .clampScalar(0, S32, S32)
429     .scalarize(0);
430 
431   if (ST.has16BitInsts()) {
432     if (ST.hasVOP3PInsts()) {
433       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
434         .legalFor({S32, S16, V2S16})
435         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
436         .clampMaxNumElements(0, S16, 2)
437         .clampScalar(0, S16, S32)
438         .widenScalarToNextPow2(0)
439         .scalarize(0);
440     } else {
441       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
442         .legalFor({S32, S16})
443         .widenScalarToNextPow2(0)
444         .clampScalar(0, S16, S32)
445         .scalarize(0);
446     }
447   } else {
448     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
449       .legalFor({S32})
450       .clampScalar(0, S32, S32)
451       .widenScalarToNextPow2(0)
452       .scalarize(0);
453   }
454 
455   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
456     return [=](const LegalityQuery &Query) {
457       return Query.Types[TypeIdx0].getSizeInBits() <
458              Query.Types[TypeIdx1].getSizeInBits();
459     };
460   };
461 
462   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
463     return [=](const LegalityQuery &Query) {
464       return Query.Types[TypeIdx0].getSizeInBits() >
465              Query.Types[TypeIdx1].getSizeInBits();
466     };
467   };
468 
469   getActionDefinitionsBuilder(G_INTTOPTR)
470     // List the common cases
471     .legalForCartesianProduct(AddrSpaces64, {S64})
472     .legalForCartesianProduct(AddrSpaces32, {S32})
473     .scalarize(0)
474     // Accept any address space as long as the size matches
475     .legalIf(sameSize(0, 1))
476     .widenScalarIf(smallerThan(1, 0),
477       [](const LegalityQuery &Query) {
478         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
479       })
480     .narrowScalarIf(greaterThan(1, 0),
481       [](const LegalityQuery &Query) {
482         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
483       });
484 
485   getActionDefinitionsBuilder(G_PTRTOINT)
486     // List the common cases
487     .legalForCartesianProduct(AddrSpaces64, {S64})
488     .legalForCartesianProduct(AddrSpaces32, {S32})
489     .scalarize(0)
490     // Accept any address space as long as the size matches
491     .legalIf(sameSize(0, 1))
492     .widenScalarIf(smallerThan(0, 1),
493       [](const LegalityQuery &Query) {
494         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
495       })
496     .narrowScalarIf(
497       greaterThan(0, 1),
498       [](const LegalityQuery &Query) {
499         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
500       });
501 
502   if (ST.hasFlatAddressSpace()) {
503     getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
504       .scalarize(0)
505       .custom();
506   }
507 
  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
509   // handle some operations by just promoting the register during
510   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
511   getActionDefinitionsBuilder({G_LOAD, G_STORE})
512     .narrowScalarIf([](const LegalityQuery &Query) {
513         unsigned Size = Query.Types[0].getSizeInBits();
514         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
515         return (Size > 32 && MemSize < Size);
516       },
517       [](const LegalityQuery &Query) {
518         return std::make_pair(0, LLT::scalar(32));
519       })
520     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
521     .fewerElementsIf([=](const LegalityQuery &Query) {
522         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
523         return (MemSize == 96) &&
524                Query.Types[0].isVector() &&
525                !ST.hasDwordx3LoadStores();
526       },
527       [=](const LegalityQuery &Query) {
528         return std::make_pair(0, V2S32);
529       })
530     .legalIf([=](const LegalityQuery &Query) {
531         const LLT &Ty0 = Query.Types[0];
532 
533         unsigned Size = Ty0.getSizeInBits();
534         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
535         if (Size < 32 || (Size > 32 && MemSize < Size))
536           return false;
537 
538         if (Ty0.isVector() && Size != MemSize)
539           return false;
540 
541         // TODO: Decompose private loads into 4-byte components.
542         // TODO: Illegal flat loads on SI
543         switch (MemSize) {
544         case 8:
545         case 16:
546           return Size == 32;
547         case 32:
548         case 64:
549         case 128:
550           return true;
551 
552         case 96:
553           return ST.hasDwordx3LoadStores();
554 
555         case 256:
556         case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
560           // TODO: Add support for other vector types, this will require
561           //       defining more value mappings for the new types.
562           return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
563                                     Ty0.getScalarType().getSizeInBits() == 64);
564 
565         default:
566           return false;
567         }
568       })
569     .clampScalar(0, S32, S64);
570 
571 
572   // FIXME: Handle alignment requirements.
573   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
574     .legalForTypesWithMemDesc({
575         {S32, GlobalPtr, 8, 8},
576         {S32, GlobalPtr, 16, 8},
577         {S32, LocalPtr, 8, 8},
578         {S32, LocalPtr, 16, 8},
579         {S32, PrivatePtr, 8, 8},
580         {S32, PrivatePtr, 16, 8}});
581   if (ST.hasFlatAddressSpace()) {
582     ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
583                                        {S32, FlatPtr, 16, 8}});
584   }
585 
586   ExtLoads.clampScalar(0, S32, S32)
587           .widenScalarToNextPow2(0)
588           .unsupportedIfMemSizeNotPow2()
589           .lower();
590 
591   auto &Atomics = getActionDefinitionsBuilder(
592     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
593      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
594      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
595      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
596     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
597                {S64, GlobalPtr}, {S64, LocalPtr}});
598   if (ST.hasFlatAddressSpace()) {
599     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
600   }
601 
602   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
603     .legalFor({{S32, LocalPtr}});
604 
605   // TODO: Pointer types, any 32-bit or 64-bit vector
606   getActionDefinitionsBuilder(G_SELECT)
607     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
608           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
609           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
610     .clampScalar(0, S16, S64)
611     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
612     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
613     .scalarize(1)
614     .clampMaxNumElements(0, S32, 2)
615     .clampMaxNumElements(0, LocalPtr, 2)
616     .clampMaxNumElements(0, PrivatePtr, 2)
617     .scalarize(0)
618     .widenScalarToNextPow2(0)
619     .legalIf(all(isPointer(0), typeIs(1, S1)));
620 
621   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
622   // be more flexible with the shift amount type.
623   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
624     .legalFor({{S32, S32}, {S64, S32}});
625   if (ST.has16BitInsts()) {
626     if (ST.hasVOP3PInsts()) {
627       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
628             .clampMaxNumElements(0, S16, 2);
629     } else
630       Shifts.legalFor({{S16, S32}, {S16, S16}});
631 
632     Shifts.clampScalar(1, S16, S32);
633     Shifts.clampScalar(0, S16, S64);
634     Shifts.widenScalarToNextPow2(0, 16);
635   } else {
636     // Make sure we legalize the shift amount type first, as the general
637     // expansion for the shifted type will produce much worse code if it hasn't
638     // been truncated already.
639     Shifts.clampScalar(1, S32, S32);
640     Shifts.clampScalar(0, S32, S64);
641     Shifts.widenScalarToNextPow2(0, 32);
642   }
643   Shifts.scalarize(0);
644 
645   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
646     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
647     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
648     unsigned IdxTypeIdx = 2;
649 
650     getActionDefinitionsBuilder(Op)
651       .customIf([=](const LegalityQuery &Query) {
652           const LLT EltTy = Query.Types[EltTypeIdx];
653           const LLT VecTy = Query.Types[VecTypeIdx];
654           const LLT IdxTy = Query.Types[IdxTypeIdx];
655           return (EltTy.getSizeInBits() == 16 ||
656                   EltTy.getSizeInBits() % 32 == 0) &&
657                  VecTy.getSizeInBits() % 32 == 0 &&
658                  VecTy.getSizeInBits() <= 512 &&
659                  IdxTy.getSizeInBits() == 32;
660         })
661       .clampScalar(EltTypeIdx, S32, S64)
662       .clampScalar(VecTypeIdx, S32, S64)
663       .clampScalar(IdxTypeIdx, S32, S32);
664   }
665 
666   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
667     .unsupportedIf([=](const LegalityQuery &Query) {
668         const LLT &EltTy = Query.Types[1].getElementType();
669         return Query.Types[0] != EltTy;
670       });
671 
672   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
673     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
674     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
675 
676     // FIXME: Doesn't handle extract of illegal sizes.
677     getActionDefinitionsBuilder(Op)
678       .legalIf([=](const LegalityQuery &Query) {
679           const LLT BigTy = Query.Types[BigTyIdx];
680           const LLT LitTy = Query.Types[LitTyIdx];
681           return (BigTy.getSizeInBits() % 32 == 0) &&
682                  (LitTy.getSizeInBits() % 16 == 0);
683         })
684       .widenScalarIf(
685         [=](const LegalityQuery &Query) {
686           const LLT BigTy = Query.Types[BigTyIdx];
687           return (BigTy.getScalarSizeInBits() < 16);
688         },
689         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
690       .widenScalarIf(
691         [=](const LegalityQuery &Query) {
692           const LLT LitTy = Query.Types[LitTyIdx];
693           return (LitTy.getScalarSizeInBits() < 16);
694         },
695         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
696       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
697       .widenScalarToNextPow2(BigTyIdx, 32);
698 
699   }
700 
701   getActionDefinitionsBuilder(G_BUILD_VECTOR)
702       .legalForCartesianProduct(AllS32Vectors, {S32})
703       .legalForCartesianProduct(AllS64Vectors, {S64})
704       .clampNumElements(0, V16S32, V16S32)
705       .clampNumElements(0, V2S64, V8S64)
706       .minScalarSameAs(1, 0)
707       .legalIf(isRegisterType(0))
708       .minScalarOrElt(0, S32);
709 
710   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
711     .legalIf(isRegisterType(0));
712 
713   // TODO: Don't fully scalarize v2s16 pieces
714   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
715 
716   // Merge/Unmerge
717   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
718     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
719     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
720 
721     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
722       const LLT &Ty = Query.Types[TypeIdx];
723       if (Ty.isVector()) {
724         const LLT &EltTy = Ty.getElementType();
725         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
726           return true;
727         if (!isPowerOf2_32(EltTy.getSizeInBits()))
728           return true;
729       }
730       return false;
731     };
732 
733     getActionDefinitionsBuilder(Op)
734       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
738       .clampScalar(LitTyIdx, S16, S256)
739       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
740       .legalIf(all(typeIs(0, S16), typeIs(1, LLT::vector(3, 16)))) // FIXME: Testing hack
741       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
742                            elementTypeIs(1, S16)),
743                        changeTo(1, V2S16))
744       // Break up vectors with weird elements into scalars
745       .fewerElementsIf(
746         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
747         scalarize(0))
748       .fewerElementsIf(
749         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
750         scalarize(1))
751       .clampScalar(BigTyIdx, S32, S512)
752       .lowerFor({{S16, V2S16}})
753       .widenScalarIf(
754         [=](const LegalityQuery &Query) {
755           const LLT &Ty = Query.Types[BigTyIdx];
756           return !isPowerOf2_32(Ty.getSizeInBits()) &&
757                  Ty.getSizeInBits() % 16 != 0;
758         },
759         [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 for sizes over 128
          // bits, whichever is smaller.
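          // For example, an s88 widens to s128, while an s280 widens to
          // s320 rather than s512.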
762           const LLT &Ty = Query.Types[BigTyIdx];
763           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
764           if (NewSizeInBits >= 256) {
765             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
766             if (RoundedTo < NewSizeInBits)
767               NewSizeInBits = RoundedTo;
768           }
769           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
770         })
771       .legalIf([=](const LegalityQuery &Query) {
772           const LLT &BigTy = Query.Types[BigTyIdx];
773           const LLT &LitTy = Query.Types[LitTyIdx];
774 
775           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
776             return false;
777           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
778             return false;
779 
780           return BigTy.getSizeInBits() % 16 == 0 &&
781                  LitTy.getSizeInBits() % 16 == 0 &&
782                  BigTy.getSizeInBits() <= 512;
783         })
784       // Any vectors left are the wrong size. Scalarize them.
785       .scalarize(0)
786       .scalarize(1);
787   }
788 
789   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
790 
791   computeTables();
792   verify(*ST.getInstrInfo());
793 }
794 
795 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
796                                          MachineRegisterInfo &MRI,
797                                          MachineIRBuilder &MIRBuilder,
798                                          GISelChangeObserver &Observer) const {
799   switch (MI.getOpcode()) {
800   case TargetOpcode::G_ADDRSPACE_CAST:
801     return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
802   case TargetOpcode::G_FRINT:
803     return legalizeFrint(MI, MRI, MIRBuilder);
804   case TargetOpcode::G_FCEIL:
805     return legalizeFceil(MI, MRI, MIRBuilder);
806   case TargetOpcode::G_INTRINSIC_TRUNC:
807     return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
808   case TargetOpcode::G_SITOFP:
809     return legalizeITOFP(MI, MRI, MIRBuilder, true);
810   case TargetOpcode::G_UITOFP:
811     return legalizeITOFP(MI, MRI, MIRBuilder, false);
812   case TargetOpcode::G_FMINNUM:
813   case TargetOpcode::G_FMAXNUM:
814   case TargetOpcode::G_FMINNUM_IEEE:
815   case TargetOpcode::G_FMAXNUM_IEEE:
816     return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
817   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
818     return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
819   case TargetOpcode::G_INSERT_VECTOR_ELT:
820     return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
821   default:
822     return false;
823   }
824 
825   llvm_unreachable("expected switch to return");
826 }
827 
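// Return a 32-bit register holding the aperture (the high dword of a flat
// address) for the given segment address space, either read from the
// hardware aperture registers or loaded from the queue pointer.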
828 Register AMDGPULegalizerInfo::getSegmentAperture(
829   unsigned AS,
830   MachineRegisterInfo &MRI,
831   MachineIRBuilder &MIRBuilder) const {
832   MachineFunction &MF = MIRBuilder.getMF();
833   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
834   const LLT S32 = LLT::scalar(32);
835 
836   if (ST.hasApertureRegs()) {
837     // FIXME: Use inline constants (src_{shared, private}_base) instead of
838     // getreg.
839     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
840         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
841         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
842     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
843         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
844         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
845     unsigned Encoding =
846         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
847         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
848         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
849 
850     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
851     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
852 
853     MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
854       .addDef(GetReg)
855       .addImm(Encoding);
856     MRI.setType(GetReg, S32);
857 
858     auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
859     MIRBuilder.buildInstr(TargetOpcode::G_SHL)
860       .addDef(ApertureReg)
861       .addUse(GetReg)
862       .addUse(ShiftAmt.getReg(0));
863 
864     return ApertureReg;
865   }
866 
867   Register QueuePtr = MRI.createGenericVirtualRegister(
868     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
869 
870   // FIXME: Placeholder until we can track the input registers.
871   MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
872 
873   // Offset into amd_queue_t for group_segment_aperture_base_hi /
874   // private_segment_aperture_base_hi.
875   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
876 
877   // FIXME: Don't use undef
878   Value *V = UndefValue::get(PointerType::get(
879                                Type::getInt8Ty(MF.getFunction().getContext()),
880                                AMDGPUAS::CONSTANT_ADDRESS));
881 
882   MachinePointerInfo PtrInfo(V, StructOffset);
883   MachineMemOperand *MMO = MF.getMachineMemOperand(
884     PtrInfo,
885     MachineMemOperand::MOLoad |
886     MachineMemOperand::MODereferenceable |
887     MachineMemOperand::MOInvariant,
888     4,
889     MinAlign(64, StructOffset));
890 
891   Register LoadResult = MRI.createGenericVirtualRegister(S32);
892   Register LoadAddr;
893 
894   MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
895   MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
896   return LoadResult;
897 }
898 
899 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
900   MachineInstr &MI, MachineRegisterInfo &MRI,
901   MachineIRBuilder &MIRBuilder) const {
902   MachineFunction &MF = MIRBuilder.getMF();
903 
904   MIRBuilder.setInstr(MI);
905 
906   Register Dst = MI.getOperand(0).getReg();
907   Register Src = MI.getOperand(1).getReg();
908 
909   LLT DstTy = MRI.getType(Dst);
910   LLT SrcTy = MRI.getType(Src);
911   unsigned DestAS = DstTy.getAddressSpace();
912   unsigned SrcAS = SrcTy.getAddressSpace();
913 
914   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
915   // vector element.
916   assert(!DstTy.isVector());
917 
918   const AMDGPUTargetMachine &TM
919     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
920 
921   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
922   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
923     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
924     return true;
925   }
926 
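  // Casting from flat to a segment address space: the result is the low 32
  // bits of the flat pointer, with the flat null value mapped to the segment
  // null value.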
927   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
928     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
929            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
930     unsigned NullVal = TM.getNullPointerValue(DestAS);
931 
932     auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
933     auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
934 
935     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
936 
937     // Extract low 32-bits of the pointer.
938     MIRBuilder.buildExtract(PtrLo32, Src, 0);
939 
940     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
941     MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
942     MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
943 
944     MI.eraseFromParent();
945     return true;
946   }
947 
948   assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
949          SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
950 
951   auto SegmentNull =
952       MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
953   auto FlatNull =
954       MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
955 
956   Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
957 
958   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
959   MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
960 
961   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
962 
963   // Coerce the type of the low half of the result so we can use merge_values.
964   Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
965   MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
966     .addDef(SrcAsInt)
967     .addUse(Src);
968 
969   // TODO: Should we allow mismatched types but matching sizes in merges to
970   // avoid the ptrtoint?
971   MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
972   MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
973 
974   MI.eraseFromParent();
975   return true;
976 }
977 
978 bool AMDGPULegalizerInfo::legalizeFrint(
979   MachineInstr &MI, MachineRegisterInfo &MRI,
980   MachineIRBuilder &MIRBuilder) const {
981   MIRBuilder.setInstr(MI);
982 
983   Register Src = MI.getOperand(1).getReg();
984   LLT Ty = MRI.getType(Src);
985   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
986 
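  // Round to the nearest integer by adding and then subtracting a copysigned
  // 2^52. Values whose magnitude exceeds C2 (the largest double below 2^52)
  // have no fractional part and are returned unchanged.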
987   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
988   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
989 
990   auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
991   auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
992 
993   // TODO: Should this propagate fast-math-flags?
994   auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
995   auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
996 
997   auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
998   auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
999 
1000   auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1001   MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1002   return true;
1003 }
1004 
1005 bool AMDGPULegalizerInfo::legalizeFceil(
1006   MachineInstr &MI, MachineRegisterInfo &MRI,
1007   MachineIRBuilder &B) const {
1008   B.setInstr(MI);
1009 
1010   const LLT S1 = LLT::scalar(1);
1011   const LLT S64 = LLT::scalar(64);
1012 
1013   Register Src = MI.getOperand(1).getReg();
1014   assert(MRI.getType(Src) == S64);
1015 
1016   // result = trunc(src)
1017   // if (src > 0.0 && src != result)
1018   //   result += 1.0
1019 
1020   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1021 
1022   const auto Zero = B.buildFConstant(S64, 0.0);
1023   const auto One = B.buildFConstant(S64, 1.0);
1024   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1025   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1026   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1027   auto Add = B.buildSelect(S64, And, One, Zero);
1028 
1029   // TODO: Should this propagate fast-math-flags?
1030   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1031   return true;
1032 }
1033 
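// Extract the 11 exponent bits from the high dword of an f64 value and return
// the exponent with the bias (1023) removed.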
1034 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1035                                               MachineIRBuilder &B) {
1036   const unsigned FractBits = 52;
1037   const unsigned ExpBits = 11;
1038   LLT S32 = LLT::scalar(32);
1039 
1040   auto Const0 = B.buildConstant(S32, FractBits - 32);
1041   auto Const1 = B.buildConstant(S32, ExpBits);
1042 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1046 
1047   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1048 }
1049 
1050 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1051   MachineInstr &MI, MachineRegisterInfo &MRI,
1052   MachineIRBuilder &B) const {
1053   B.setInstr(MI);
1054 
1055   const LLT S1 = LLT::scalar(1);
1056   const LLT S32 = LLT::scalar(32);
1057   const LLT S64 = LLT::scalar(64);
1058 
1059   Register Src = MI.getOperand(1).getReg();
1060   assert(MRI.getType(Src) == S64);
1061 
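  // Compute trunc by masking off the fractional bits of the mantissa: build a
  // mask of the bits below the binary point from the exponent, clear them,
  // and use selects for the exponent < 0 (result is +/-0) and exponent > 51
  // (already an integer) cases.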
1062   // TODO: Should this use extract since the low half is unused?
1063   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1064   Register Hi = Unmerge.getReg(1);
1065 
1066   // Extract the upper half, since this is where we will find the sign and
1067   // exponent.
1068   auto Exp = extractF64Exponent(Hi, B);
1069 
1070   const unsigned FractBits = 52;
1071 
1072   // Extract the sign bit.
1073   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1074   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1075 
1076   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1077 
1078   const auto Zero32 = B.buildConstant(S32, 0);
1079 
1080   // Extend back to 64-bits.
1081   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1082 
1083   auto Shr = B.buildAShr(S64, FractMask, Exp);
1084   auto Not = B.buildNot(S64, Shr);
1085   auto Tmp0 = B.buildAnd(S64, Src, Not);
1086   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1087 
1088   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1089   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1090 
1091   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1092   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1093   return true;
1094 }
1095 
1096 bool AMDGPULegalizerInfo::legalizeITOFP(
1097   MachineInstr &MI, MachineRegisterInfo &MRI,
1098   MachineIRBuilder &B, bool Signed) const {
1099   B.setInstr(MI);
1100 
1101   Register Dst = MI.getOperand(0).getReg();
1102   Register Src = MI.getOperand(1).getReg();
1103 
1104   const LLT S64 = LLT::scalar(64);
1105   const LLT S32 = LLT::scalar(32);
1106 
1107   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1108 
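  // Convert the two 32-bit halves separately, scale the converted high half
  // by 2^32 with ldexp, and add in the unsigned low half.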
1109   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1110 
1111   auto CvtHi = Signed ?
1112     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1113     B.buildUITOFP(S64, Unmerge.getReg(1));
1114 
1115   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1116 
1117   auto ThirtyTwo = B.buildConstant(S32, 32);
1118   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1119     .addUse(CvtHi.getReg(0))
1120     .addUse(ThirtyTwo.getReg(0));
1121 
1122   // TODO: Should this propagate fast-math-flags?
1123   B.buildFAdd(Dst, LdExp, CvtLo);
1124   MI.eraseFromParent();
1125   return true;
1126 }
1127 
1128 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1129   MachineInstr &MI, MachineRegisterInfo &MRI,
1130   MachineIRBuilder &B) const {
1131   MachineFunction &MF = B.getMF();
1132   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1133 
1134   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1135                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1136 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1139   if (!MFI->getMode().IEEE)
1140     return !IsIEEEOp;
1141 
1142   if (IsIEEEOp)
1143     return true;
1144 
1145   MachineIRBuilder HelperBuilder(MI);
1146   GISelObserverWrapper DummyObserver;
1147   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1148   HelperBuilder.setMBB(*MI.getParent());
1149   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1150 }
1151 
1152 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1153   MachineInstr &MI, MachineRegisterInfo &MRI,
1154   MachineIRBuilder &B) const {
1155   // TODO: Should move some of this into LegalizerHelper.
1156 
1157   // TODO: Promote dynamic indexing of s16 to s32
1158   // TODO: Dynamic s64 indexing is only legal for SGPR.
1159   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1160   if (!IdxVal) // Dynamic case will be selected to register indexing.
1161     return true;
1162 
1163   Register Dst = MI.getOperand(0).getReg();
1164   Register Vec = MI.getOperand(1).getReg();
1165 
1166   LLT VecTy = MRI.getType(Vec);
1167   LLT EltTy = VecTy.getElementType();
1168   assert(EltTy == MRI.getType(Dst));
1169 
1170   B.setInstr(MI);
1171 
1172   if (IdxVal.getValue() < VecTy.getNumElements())
1173     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1174   else
1175     B.buildUndef(Dst);
1176 
1177   MI.eraseFromParent();
1178   return true;
1179 }
1180 
1181 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1182   MachineInstr &MI, MachineRegisterInfo &MRI,
1183   MachineIRBuilder &B) const {
1184   // TODO: Should move some of this into LegalizerHelper.
1185 
1186   // TODO: Promote dynamic indexing of s16 to s32
1187   // TODO: Dynamic s64 indexing is only legal for SGPR.
1188   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1189   if (!IdxVal) // Dynamic case will be selected to register indexing.
1190     return true;
1191 
1192   Register Dst = MI.getOperand(0).getReg();
1193   Register Vec = MI.getOperand(1).getReg();
1194   Register Ins = MI.getOperand(2).getReg();
1195 
1196   LLT VecTy = MRI.getType(Vec);
1197   LLT EltTy = VecTy.getElementType();
1198   assert(EltTy == MRI.getType(Ins));
1199 
1200   B.setInstr(MI);
1201 
1202   if (IdxVal.getValue() < VecTy.getNumElements())
1203     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1204   else
1205     B.buildUndef(Dst);
1206 
1207   MI.eraseFromParent();
1208   return true;
1209 }
1210 
// Return the use branch instruction, or null if the usage is invalid.
1212 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1213                                        MachineRegisterInfo &MRI) {
1214   Register CondDef = MI.getOperand(0).getReg();
1215   if (!MRI.hasOneNonDBGUse(CondDef))
1216     return nullptr;
1217 
1218   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1219   return UseMI.getParent() == MI.getParent() &&
1220     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1221 }
1222 
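// Return the virtual register already associated with the live-in physical
// register Reg, or create one of type Ty and record the live-in.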
1223 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1224                                                 Register Reg, LLT Ty) const {
1225   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1226   if (LiveIn)
1227     return LiveIn;
1228 
1229   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1230   MRI.addLiveIn(Reg, NewReg);
1231   return NewReg;
1232 }
1233 
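// Copy a preloaded argument register into DstReg. Masked arguments (several
// values packed into one register) are shifted and masked down to the
// requested field first.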
1234 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1235                                          const ArgDescriptor *Arg) const {
1236   if (!Arg->isRegister())
1237     return false; // TODO: Handle these
1238 
1239   assert(Arg->getRegister() != 0);
1240   assert(Arg->getRegister().isPhysical());
1241 
1242   MachineRegisterInfo &MRI = *B.getMRI();
1243 
1244   LLT Ty = MRI.getType(DstReg);
1245   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1246 
1247   if (Arg->isMasked()) {
1248     // TODO: Should we try to emit this once in the entry block?
1249     const LLT S32 = LLT::scalar(32);
1250     const unsigned Mask = Arg->getMask();
1251     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1252 
1253     auto ShiftAmt = B.buildConstant(S32, Shift);
1254     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1255     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1256   } else
1257     B.buildCopy(DstReg, LiveIn);
1258 
  // Insert the argument copy if it doesn't already exist.
1260   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1261   if (!MRI.getVRegDef(LiveIn)) {
1262     MachineBasicBlock &EntryMBB = B.getMF().front();
1263     EntryMBB.addLiveIn(Arg->getRegister());
1264     B.setInsertPt(EntryMBB, EntryMBB.begin());
1265     B.buildCopy(LiveIn, Arg->getRegister());
1266   }
1267 
1268   return true;
1269 }
1270 
1271 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1272   MachineInstr &MI,
1273   MachineRegisterInfo &MRI,
1274   MachineIRBuilder &B,
1275   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1276   B.setInstr(MI);
1277 
1278   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1279 
1280   const ArgDescriptor *Arg;
1281   const TargetRegisterClass *RC;
1282   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1283   if (!Arg) {
1284     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1285     return false;
1286   }
1287 
1288   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1289     MI.eraseFromParent();
1290     return true;
1291   }
1292 
1293   return false;
1294 }
1295 
1296 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1297                                            MachineRegisterInfo &MRI,
1298                                            MachineIRBuilder &B) const {
1299   B.setInstr(MI);
1300   Register Res = MI.getOperand(0).getReg();
1301   Register LHS = MI.getOperand(2).getReg();
1302   Register RHS = MI.getOperand(3).getReg();
1303   uint16_t Flags = MI.getFlags();
1304 
1305   LLT S32 = LLT::scalar(32);
1306   LLT S1 = LLT::scalar(1);
1307 
1308   auto Abs = B.buildFAbs(S32, RHS, Flags);
1309   const APFloat C0Val(1.0f);
1310 
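  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. A large
  // denominator is pre-scaled by 2^-32 before the reciprocal, and the same
  // factor is applied to the final multiply so the quotient is unchanged
  // while the reciprocal stays out of the denormal range.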
1311   auto C0 = B.buildConstant(S32, 0x6f800000);
1312   auto C1 = B.buildConstant(S32, 0x2f800000);
1313   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1314 
1315   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1316   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1317 
1318   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1319 
1320   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1321     .addUse(Mul0.getReg(0))
1322     .setMIFlags(Flags);
1323 
1324   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1325 
1326   B.buildFMul(Res, Sel, Mul1, Flags);
1327 
1328   MI.eraseFromParent();
1329   return true;
1330 }
1331 
1332 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1333                                                  MachineRegisterInfo &MRI,
1334                                                  MachineIRBuilder &B) const {
1335   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1336   if (!MFI->isEntryFunction()) {
1337     return legalizePreloadedArgIntrin(MI, MRI, B,
1338                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1339   }
1340 
1341   B.setInstr(MI);
1342 
1343   uint64_t Offset =
1344     ST.getTargetLowering()->getImplicitParameterOffset(
1345       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1346   Register DstReg = MI.getOperand(0).getReg();
1347   LLT DstTy = MRI.getType(DstReg);
1348   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1349 
1350   const ArgDescriptor *Arg;
1351   const TargetRegisterClass *RC;
1352   std::tie(Arg, RC)
1353     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1354   if (!Arg)
1355     return false;
1356 
1357   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1358   if (!loadInputValue(KernargPtrReg, B, Arg))
1359     return false;
1360 
1361   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1362   MI.eraseFromParent();
1363   return true;
1364 }
1365 
1366 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1367                                             MachineRegisterInfo &MRI,
1368                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with pseudos that manipulate exec and branch.
1370   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1371   case Intrinsic::amdgcn_if: {
1372     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1373       const SIRegisterInfo *TRI
1374         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1375 
1376       B.setInstr(*BrCond);
1377       Register Def = MI.getOperand(1).getReg();
1378       Register Use = MI.getOperand(3).getReg();
1379       B.buildInstr(AMDGPU::SI_IF)
1380         .addDef(Def)
1381         .addUse(Use)
1382         .addMBB(BrCond->getOperand(1).getMBB());
1383 
1384       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1385       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1386       MI.eraseFromParent();
1387       BrCond->eraseFromParent();
1388       return true;
1389     }
1390 
1391     return false;
1392   }
1393   case Intrinsic::amdgcn_loop: {
1394     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1395       const SIRegisterInfo *TRI
1396         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1397 
1398       B.setInstr(*BrCond);
1399       Register Reg = MI.getOperand(2).getReg();
1400       B.buildInstr(AMDGPU::SI_LOOP)
1401         .addUse(Reg)
1402         .addMBB(BrCond->getOperand(1).getMBB());
1403       MI.eraseFromParent();
1404       BrCond->eraseFromParent();
1405       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1406       return true;
1407     }
1408 
1409     return false;
1410   }
1411   case Intrinsic::amdgcn_kernarg_segment_ptr:
1412     return legalizePreloadedArgIntrin(
1413       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1414   case Intrinsic::amdgcn_implicitarg_ptr:
1415     return legalizeImplicitArgPtr(MI, MRI, B);
1416   case Intrinsic::amdgcn_workitem_id_x:
1417     return legalizePreloadedArgIntrin(MI, MRI, B,
1418                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1419   case Intrinsic::amdgcn_workitem_id_y:
1420     return legalizePreloadedArgIntrin(MI, MRI, B,
1421                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1422   case Intrinsic::amdgcn_workitem_id_z:
1423     return legalizePreloadedArgIntrin(MI, MRI, B,
1424                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1425   case Intrinsic::amdgcn_workgroup_id_x:
1426     return legalizePreloadedArgIntrin(MI, MRI, B,
1427                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1428   case Intrinsic::amdgcn_workgroup_id_y:
1429     return legalizePreloadedArgIntrin(MI, MRI, B,
1430                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1431   case Intrinsic::amdgcn_workgroup_id_z:
1432     return legalizePreloadedArgIntrin(MI, MRI, B,
1433                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1434   case Intrinsic::amdgcn_dispatch_ptr:
1435     return legalizePreloadedArgIntrin(MI, MRI, B,
1436                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1437   case Intrinsic::amdgcn_queue_ptr:
1438     return legalizePreloadedArgIntrin(MI, MRI, B,
1439                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1440   case Intrinsic::amdgcn_implicit_buffer_ptr:
1441     return legalizePreloadedArgIntrin(
1442       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1443   case Intrinsic::amdgcn_dispatch_id:
1444     return legalizePreloadedArgIntrin(MI, MRI, B,
1445                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1446   case Intrinsic::amdgcn_fdiv_fast:
1447     return legalizeFDIVFast(MI, MRI, B);
1448   default:
1449     return true;
1450   }
1451 
1452   return true;
1453 }
1454