//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

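// A worked example for fewerEltsToSize64Vector below (illustrative, not
// exhaustive): v4s32 is 128 bits, so Pieces = 2 and
// NewNumElts = (4 + 1) / 2 = 2, stepping the type down toward v2s32 chunks;
// v3s32 (96 bits) likewise gives Pieces = 2 and v2s32.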
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST,
                                         const GCNTargetMachine &TM) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);
  const LLT V8S16 = LLT::vector(8, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));


  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FCOPYSIGN)
    .legalForCartesianProduct({S16, S32, S64}, {S16, S32, S64})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalFor({{S1, S32}, {S1, S64}})
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);


  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

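  // For the pointer/integer conversions below, any (pointer, integer) pair
  // is accepted once the sizes match; a mismatched integer operand is first
  // mutated to the pointer's size. Sketch: an s16 result of a G_PTRTOINT
  // from a p3 (32-bit) pointer is widened to s32, and the legalizer then
  // truncates the wide result back down to s16.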
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

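  // For G_LOAD/G_STORE below: when the register type is wider than 32 bits
  // and wider than the memory size (an extending load, e.g. an s64 value
  // backed by a 32-bit access), the value is first narrowed to s32 and the
  // extension becomes a separate artifact. 96-bit vector accesses are
  // stepped down toward v2s32 on subtargets without the 3-dword
  // load/store instructions.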
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=, &ST](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=, &ST](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          // XXX hasLoadX3
          return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS);

        case 256:
        case 512:
          // TODO: constant loads
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

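  // G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT below stay legal whenever the
  // whole vector is a multiple of 32 bits, at most 512 bits, and the index
  // is s32: e.g. extracting an s32 lane of v8s32 with an s32 index is legal,
  // while a 64-bit index is first clamped down to s32.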
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &VecTy = Query.Types[VecTypeIdx];
          const LLT &IdxTy = Query.Types[IdxTypeIdx];
          return VecTy.getSizeInBits() % 32 == 0 &&
            VecTy.getSizeInBits() <= 512 &&
            IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  // TODO: vectors of pointers
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalForCartesianProduct(AllS32Vectors, {S32})
      .legalForCartesianProduct(AllS64Vectors, {S64})
      .clampNumElements(0, V16S32, V16S32)
      .clampNumElements(0, V2S64, V8S64)
      .minScalarSameAs(1, 0)
      // FIXME: Sort of a hack to make progress on other legalizations.
      .legalIf([=](const LegalityQuery &Query) {
        return Query.Types[0].getScalarSizeInBits() <= 32 ||
               Query.Types[0].getScalarSizeInBits() == 64;
      });

  // TODO: Support any combination of v2s32
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalFor({{V4S32, V2S32},
               {V8S32, V2S32},
               {V8S32, V4S32},
               {V4S64, V2S64},
               {V4S16, V2S16},
               {V8S16, V2S16},
               {V8S16, V4S16},
               {LLT::vector(4, LocalPtr), LLT::vector(2, LocalPtr)},
               {LLT::vector(4, PrivatePtr), LLT::vector(2, PrivatePtr)}});

  // Merge/Unmerge
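  // The widenScalarIf mutation below rounds an odd big type up to the next
  // power of 2, or to a multiple of 64 once past 256 bits, whichever is
  // smaller. Worked examples: s20 widens to s32, while s300 widens to
  // alignTo<64>(301) = 320 rather than to 512.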
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

unsigned AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

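  // On subtargets with aperture registers (gfx9 and later), the high 32 bits
  // of the segment aperture are read from the MEM_BASES hardware register
  // with s_getreg_b32 and shifted into position; otherwise they are loaded
  // from the queue pointer below.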
  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    unsigned ApertureReg = MRI.createGenericVirtualRegister(S32);
    unsigned GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  unsigned QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
                               Type::getInt8Ty(MF.getFunction().getContext()),
                               AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  unsigned LoadResult = MRI.createGenericVirtualRegister(S32);
  unsigned LoadAddr = AMDGPU::NoRegister;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

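// Lower G_ADDRSPACE_CAST. A flat -> local/private cast truncates the pointer
// to its low 32 bits; a local/private -> flat cast rebuilds the 64-bit
// pointer as merge(low_half, aperture_base_hi). Both directions map the null
// value of one address space to the null value of the other with a compare
// and select.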
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    unsigned PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  unsigned ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  unsigned CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  unsigned BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  unsigned SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

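// Lower G_FRINT for f64 using the magic-number trick: adding and then
// subtracting copysign(2^52, x) forces rounding to integer precision, while
// inputs with |x| > 0x1.fffffffffffffp+51 are already exact integers and
// pass through unchanged. Roughly:
//   t = (x + copysign(2^52, x)) - copysign(2^52, x)
//   result = |x| > 0x1.fffffffffffffp+51 ? x : t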
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  unsigned Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  unsigned Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

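// Lower G_INTRINSIC_TRUNC for f64 by clearing the fraction bits that lie
// below the binary point: the bits covered by (fract_mask >> exp) are masked
// off. By cases:
//   exp < 0   -> +/-0.0 (only the sign bit survives)
//   0..51     -> src & ~(fract_mask >> exp)
//   exp > 51  -> src (already integral; also covers inf/nan)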
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  unsigned Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  unsigned Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

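// Lower a 64-bit integer to f64 conversion as two 32-bit conversions:
//   result = ldexp(cvt(hi), 32) + uitofp(lo)
// where the high half uses a signed or unsigned convert depending on the
// source opcode and the low half is always treated as unsigned.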
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}