1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
25 
26 #define DEBUG_TYPE "amdgpu-legalinfo"
27 
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
32 
33 
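// Return true if the type is at most MaxSize bits wide and its scalar element
// size is a multiple of 32.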
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35                                       unsigned MaxSize = 512) {
36   return [=](const LegalityQuery &Query) {
37     const LLT Ty = Query.Types[TypeIdx];
38     const LLT EltTy = Ty.getScalarType();
39     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
40   };
41 }
42 
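// Return true for vectors with an odd number of elements narrower than 32
// bits (e.g. v3s16), which the rules below pad to an even element count.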
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     return Ty.isVector() &&
47            Ty.getNumElements() % 2 != 0 &&
48            Ty.getElementType().getSizeInBits() < 32;
49   };
50 }
51 
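// Mutation that pads the vector type with one extra element of the same
// element type, e.g. v3s16 -> v4s16.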
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53   return [=](const LegalityQuery &Query) {
54     const LLT Ty = Query.Types[TypeIdx];
55     const LLT EltTy = Ty.getElementType();
56     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
57   };
58 }
59 
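// Mutation that reduces the element count so each resulting piece is at most
// 64 bits, e.g. v4s32 (128 bits) and v3s32 (96 bits) both become v2s32.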
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     unsigned Size = Ty.getSizeInBits();
65     unsigned Pieces = (Size + 63) / 64;
66     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
68   };
69 }
70 
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72   return [=](const LegalityQuery &Query) {
73     const LLT QueryTy = Query.Types[TypeIdx];
74     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
75   };
76 }
77 
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT QueryTy = Query.Types[TypeIdx];
81     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
82   };
83 }
84 
85 // Vectors of 32, 64, 128 or 256-bit elements, multiples of v2s16, and scalars
86 // that are a multiple of 32 bits, up to 512 bits.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     if (Ty.isVector()) {
91       const int EltSize = Ty.getElementType().getSizeInBits();
92       return EltSize == 32 || EltSize == 64 ||
93             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94              EltSize == 128 || EltSize == 256;
95     }
96 
97     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
98   };
99 }
100 
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102                                          const GCNTargetMachine &TM)
103   :  ST(ST_) {
104   using namespace TargetOpcode;
105 
106   auto GetAddrSpacePtr = [&TM](unsigned AS) {
107     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
108   };
109 
110   const LLT S1 = LLT::scalar(1);
111   const LLT S8 = LLT::scalar(8);
112   const LLT S16 = LLT::scalar(16);
113   const LLT S32 = LLT::scalar(32);
114   const LLT S64 = LLT::scalar(64);
115   const LLT S128 = LLT::scalar(128);
116   const LLT S256 = LLT::scalar(256);
117   const LLT S512 = LLT::scalar(512);
118 
119   const LLT V2S16 = LLT::vector(2, 16);
120   const LLT V4S16 = LLT::vector(4, 16);
121 
122   const LLT V2S32 = LLT::vector(2, 32);
123   const LLT V3S32 = LLT::vector(3, 32);
124   const LLT V4S32 = LLT::vector(4, 32);
125   const LLT V5S32 = LLT::vector(5, 32);
126   const LLT V6S32 = LLT::vector(6, 32);
127   const LLT V7S32 = LLT::vector(7, 32);
128   const LLT V8S32 = LLT::vector(8, 32);
129   const LLT V9S32 = LLT::vector(9, 32);
130   const LLT V10S32 = LLT::vector(10, 32);
131   const LLT V11S32 = LLT::vector(11, 32);
132   const LLT V12S32 = LLT::vector(12, 32);
133   const LLT V13S32 = LLT::vector(13, 32);
134   const LLT V14S32 = LLT::vector(14, 32);
135   const LLT V15S32 = LLT::vector(15, 32);
136   const LLT V16S32 = LLT::vector(16, 32);
137 
138   const LLT V2S64 = LLT::vector(2, 64);
139   const LLT V3S64 = LLT::vector(3, 64);
140   const LLT V4S64 = LLT::vector(4, 64);
141   const LLT V5S64 = LLT::vector(5, 64);
142   const LLT V6S64 = LLT::vector(6, 64);
143   const LLT V7S64 = LLT::vector(7, 64);
144   const LLT V8S64 = LLT::vector(8, 64);
145 
146   std::initializer_list<LLT> AllS32Vectors =
147     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149   std::initializer_list<LLT> AllS64Vectors =
150     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
151 
152   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
155   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
156   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
157   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
158   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
159 
160   const LLT CodePtr = FlatPtr;
161 
162   const std::initializer_list<LLT> AddrSpaces64 = {
163     GlobalPtr, ConstantPtr, FlatPtr
164   };
165 
166   const std::initializer_list<LLT> AddrSpaces32 = {
167     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
168   };
169 
170   const std::initializer_list<LLT> FPTypesBase = {
171     S32, S64
172   };
173 
174   const std::initializer_list<LLT> FPTypes16 = {
175     S32, S64, S16
176   };
177 
178   const std::initializer_list<LLT> FPTypesPK16 = {
179     S32, S64, S16, V2S16
180   };
181 
182   setAction({G_BRCOND, S1}, Legal);
183 
184   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
185   // elements for v3s16
186   getActionDefinitionsBuilder(G_PHI)
187     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
188     .legalFor(AllS32Vectors)
189     .legalFor(AllS64Vectors)
190     .legalFor(AddrSpaces64)
191     .legalFor(AddrSpaces32)
192     .clampScalar(0, S32, S256)
193     .widenScalarToNextPow2(0, 32)
194     .clampMaxNumElements(0, S32, 16)
195     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
196     .legalIf(isPointer(0));
197 
198   if (ST.has16BitInsts()) {
199     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
200       .legalFor({S32, S16})
201       .clampScalar(0, S16, S32)
202       .scalarize(0);
203   } else {
204     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
205       .legalFor({S32})
206       .clampScalar(0, S32, S32)
207       .scalarize(0);
208   }
209 
210   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
211     .legalFor({S32})
212     .clampScalar(0, S32, S32)
213     .scalarize(0);
214 
215   // Report legal for any types we can handle anywhere. For the cases only legal
216   // on the SALU, RegBankSelect will be able to re-legalize.
217   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
218     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
219     .clampScalar(0, S32, S64)
220     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
221     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
222     .widenScalarToNextPow2(0)
223     .scalarize(0);
224 
225   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
226                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
227     .legalFor({{S32, S1}})
228     .clampScalar(0, S32, S32);
229 
230   getActionDefinitionsBuilder(G_BITCAST)
231     .legalForCartesianProduct({S32, V2S16})
232     .legalForCartesianProduct({S64, V2S32, V4S16})
233     .legalForCartesianProduct({V2S64, V4S32})
234     // Don't worry about the size constraint.
235     .legalIf(all(isPointer(0), isPointer(1)));
236 
237   if (ST.has16BitInsts()) {
238     getActionDefinitionsBuilder(G_FCONSTANT)
239       .legalFor({S32, S64, S16})
240       .clampScalar(0, S16, S64);
241   } else {
242     getActionDefinitionsBuilder(G_FCONSTANT)
243       .legalFor({S32, S64})
244       .clampScalar(0, S32, S64);
245   }
246 
247   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
248     .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
249                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
250     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
251     .clampScalarOrElt(0, S32, S512)
252     .legalIf(isMultiple32(0))
253     .widenScalarToNextPow2(0, 32)
254     .clampMaxNumElements(0, S32, 16);
255 
256 
257   // FIXME: i1 operands to intrinsics should always be legal, but other i1
258   // values may not be legal.  We need to figure out how to distinguish
259   // between these two scenarios.
260   getActionDefinitionsBuilder(G_CONSTANT)
261     .legalFor({S1, S32, S64, GlobalPtr,
262                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
263     .clampScalar(0, S32, S64)
264     .widenScalarToNextPow2(0)
265     .legalIf(isPointer(0));
266 
267   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
268 
269   auto &FPOpActions = getActionDefinitionsBuilder(
270     { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
271     .legalFor({S32, S64});
272 
273   if (ST.has16BitInsts()) {
274     if (ST.hasVOP3PInsts())
275       FPOpActions.legalFor({S16, V2S16});
276     else
277       FPOpActions.legalFor({S16});
278   }
279 
280   auto &MinNumMaxNum = getActionDefinitionsBuilder({
281       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
282 
283   if (ST.hasVOP3PInsts()) {
284     MinNumMaxNum.customFor(FPTypesPK16)
285       .clampMaxNumElements(0, S16, 2)
286       .clampScalar(0, S16, S64)
287       .scalarize(0);
288   } else if (ST.has16BitInsts()) {
289     MinNumMaxNum.customFor(FPTypes16)
290       .clampScalar(0, S16, S64)
291       .scalarize(0);
292   } else {
293     MinNumMaxNum.customFor(FPTypesBase)
294       .clampScalar(0, S32, S64)
295       .scalarize(0);
296   }
297 
298   // TODO: Implement
299   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
300 
301   if (ST.hasVOP3PInsts())
302     FPOpActions.clampMaxNumElements(0, S16, 2);
303   FPOpActions
304     .scalarize(0)
305     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
306 
307   if (ST.has16BitInsts()) {
308     getActionDefinitionsBuilder(G_FSQRT)
309       .legalFor({S32, S64, S16})
310       .scalarize(0)
311       .clampScalar(0, S16, S64);
312   } else {
313     getActionDefinitionsBuilder(G_FSQRT)
314       .legalFor({S32, S64})
315       .scalarize(0)
316       .clampScalar(0, S32, S64);
317   }
318 
319   getActionDefinitionsBuilder(G_FPTRUNC)
320     .legalFor({{S32, S64}, {S16, S32}})
321     .scalarize(0);
322 
323   getActionDefinitionsBuilder(G_FPEXT)
324     .legalFor({{S64, S32}, {S32, S16}})
325     .lowerFor({{S64, S16}}) // FIXME: Implement
326     .scalarize(0);
327 
328   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
329   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
330 
331   getActionDefinitionsBuilder(G_FSUB)
332       // Use actual fsub instruction
333       .legalFor({S32})
334       // Must use fadd + fneg
335       .lowerFor({S64, S16, V2S16})
336       .scalarize(0)
337       .clampScalar(0, S32, S64);
338 
339   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
340     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
341                {S32, S1}, {S64, S1}, {S16, S1},
342                // FIXME: Hack
343                {S64, LLT::scalar(33)},
344                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
345     .scalarize(0);
346 
347   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
348     .legalFor({{S32, S32}, {S64, S32}})
349     .lowerFor({{S32, S64}})
350     .customFor({{S64, S64}})
351     .scalarize(0);
352 
353   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
354     .legalFor({{S32, S32}, {S32, S64}})
355     .scalarize(0);
356 
357   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
358     .legalFor({S32, S64})
359     .scalarize(0);
360 
361   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
362     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
363       .legalFor({S32, S64})
364       .clampScalar(0, S32, S64)
365       .scalarize(0);
366   } else {
367     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
368       .legalFor({S32})
369       .customFor({S64})
370       .clampScalar(0, S32, S64)
371       .scalarize(0);
372   }
373 
374   getActionDefinitionsBuilder(G_GEP)
375     .legalForCartesianProduct(AddrSpaces64, {S64})
376     .legalForCartesianProduct(AddrSpaces32, {S32})
377     .scalarize(0);
378 
379   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
380 
381   auto &CmpBuilder =
382     getActionDefinitionsBuilder(G_ICMP)
383     .legalForCartesianProduct(
384       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
385     .legalFor({{S1, S32}, {S1, S64}});
386   if (ST.has16BitInsts()) {
387     CmpBuilder.legalFor({{S1, S16}});
388   }
389 
390   CmpBuilder
391     .widenScalarToNextPow2(1)
392     .clampScalar(1, S32, S64)
393     .scalarize(0)
394     .legalIf(all(typeIs(0, S1), isPointer(1)));
395 
396   getActionDefinitionsBuilder(G_FCMP)
397     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
398     .widenScalarToNextPow2(1)
399     .clampScalar(1, S32, S64)
400     .scalarize(0);
401 
402   // FIXME: fexp, flog2, flog10 need to be custom lowered.
403   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
404                                G_FLOG, G_FLOG2, G_FLOG10})
405     .legalFor({S32})
406     .scalarize(0);
407 
408   // The 64-bit versions produce 32-bit results, but only on the SALU.
409   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
410                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
411                                G_CTPOP})
412     .legalFor({{S32, S32}, {S32, S64}})
413     .clampScalar(0, S32, S32)
414     .clampScalar(1, S32, S64)
415     .scalarize(0)
416     .widenScalarToNextPow2(0, 32)
417     .widenScalarToNextPow2(1, 32);
418 
419   // TODO: Expand for > s32
420   getActionDefinitionsBuilder(G_BSWAP)
421     .legalFor({S32})
422     .clampScalar(0, S32, S32)
423     .scalarize(0);
424 
425   if (ST.has16BitInsts()) {
426     if (ST.hasVOP3PInsts()) {
427       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
428         .legalFor({S32, S16, V2S16})
429         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
430         .clampMaxNumElements(0, S16, 2)
431         .clampScalar(0, S16, S32)
432         .widenScalarToNextPow2(0)
433         .scalarize(0);
434     } else {
435       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
436         .legalFor({S32, S16})
437         .widenScalarToNextPow2(0)
438         .clampScalar(0, S16, S32)
439         .scalarize(0);
440     }
441   } else {
442     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
443       .legalFor({S32})
444       .clampScalar(0, S32, S32)
445       .widenScalarToNextPow2(0)
446       .scalarize(0);
447   }
448 
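  // Size comparisons between two type indices, used to widen or narrow the
  // integer operand of the pointer <-> integer casts below.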
449   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
450     return [=](const LegalityQuery &Query) {
451       return Query.Types[TypeIdx0].getSizeInBits() <
452              Query.Types[TypeIdx1].getSizeInBits();
453     };
454   };
455 
456   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
457     return [=](const LegalityQuery &Query) {
458       return Query.Types[TypeIdx0].getSizeInBits() >
459              Query.Types[TypeIdx1].getSizeInBits();
460     };
461   };
462 
463   getActionDefinitionsBuilder(G_INTTOPTR)
464     // List the common cases
465     .legalForCartesianProduct(AddrSpaces64, {S64})
466     .legalForCartesianProduct(AddrSpaces32, {S32})
467     .scalarize(0)
468     // Accept any address space as long as the size matches
469     .legalIf(sameSize(0, 1))
470     .widenScalarIf(smallerThan(1, 0),
471       [](const LegalityQuery &Query) {
472         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
473       })
474     .narrowScalarIf(greaterThan(1, 0),
475       [](const LegalityQuery &Query) {
476         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
477       });
478 
479   getActionDefinitionsBuilder(G_PTRTOINT)
480     // List the common cases
481     .legalForCartesianProduct(AddrSpaces64, {S64})
482     .legalForCartesianProduct(AddrSpaces32, {S32})
483     .scalarize(0)
484     // Accept any address space as long as the size matches
485     .legalIf(sameSize(0, 1))
486     .widenScalarIf(smallerThan(0, 1),
487       [](const LegalityQuery &Query) {
488         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
489       })
490     .narrowScalarIf(
491       greaterThan(0, 1),
492       [](const LegalityQuery &Query) {
493         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
494       });
495 
496   if (ST.hasFlatAddressSpace()) {
497     getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
498       .scalarize(0)
499       .custom();
500   }
501 
502   // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
503   // handle some operations by just promoting the register during
504   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
505   getActionDefinitionsBuilder({G_LOAD, G_STORE})
506     .narrowScalarIf([](const LegalityQuery &Query) {
507         unsigned Size = Query.Types[0].getSizeInBits();
508         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
509         return (Size > 32 && MemSize < Size);
510       },
511       [](const LegalityQuery &Query) {
512         return std::make_pair(0, LLT::scalar(32));
513       })
514     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515     .fewerElementsIf([=](const LegalityQuery &Query) {
516         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
517         return (MemSize == 96) &&
518                Query.Types[0].isVector() &&
519                !ST.hasDwordx3LoadStores();
520       },
521       [=](const LegalityQuery &Query) {
522         return std::make_pair(0, V2S32);
523       })
524     .legalIf([=](const LegalityQuery &Query) {
525         const LLT &Ty0 = Query.Types[0];
526 
527         unsigned Size = Ty0.getSizeInBits();
528         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
529         if (Size < 32 || (Size > 32 && MemSize < Size))
530           return false;
531 
532         if (Ty0.isVector() && Size != MemSize)
533           return false;
534 
535         // TODO: Decompose private loads into 4-byte components.
536         // TODO: Illegal flat loads on SI
537         switch (MemSize) {
538         case 8:
539         case 16:
540           return Size == 32;
541         case 32:
542         case 64:
543         case 128:
544           return true;
545 
546         case 96:
547           return ST.hasDwordx3LoadStores();
548 
549         case 256:
550         case 512:
551           // TODO: Possibly support loads of i256 and i512. This will require
552           // adding i256 and i512 types to MVT in order to be able to use
553           // TableGen.
554           // TODO: Add support for other vector types, this will require
555           //       defining more value mappings for the new types.
556           return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
557                                     Ty0.getScalarType().getSizeInBits() == 64);
558 
559         default:
560           return false;
561         }
562       })
563     .clampScalar(0, S32, S64);
564 
565 
566   // FIXME: Handle alignment requirements.
567   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
568     .legalForTypesWithMemDesc({
569         {S32, GlobalPtr, 8, 8},
570         {S32, GlobalPtr, 16, 8},
571         {S32, LocalPtr, 8, 8},
572         {S32, LocalPtr, 16, 8},
573         {S32, PrivatePtr, 8, 8},
574         {S32, PrivatePtr, 16, 8}});
575   if (ST.hasFlatAddressSpace()) {
576     ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
577                                        {S32, FlatPtr, 16, 8}});
578   }
579 
580   ExtLoads.clampScalar(0, S32, S32)
581           .widenScalarToNextPow2(0)
582           .unsupportedIfMemSizeNotPow2()
583           .lower();
584 
585   auto &Atomics = getActionDefinitionsBuilder(
586     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
587      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
588      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
589      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
590     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
591                {S64, GlobalPtr}, {S64, LocalPtr}});
592   if (ST.hasFlatAddressSpace()) {
593     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
594   }
595 
596   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
597     .legalFor({{S32, LocalPtr}});
598 
599   // TODO: Pointer types, any 32-bit or 64-bit vector
600   getActionDefinitionsBuilder(G_SELECT)
601     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
602           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
603           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
604     .clampScalar(0, S16, S64)
605     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
606     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
607     .scalarize(1)
608     .clampMaxNumElements(0, S32, 2)
609     .clampMaxNumElements(0, LocalPtr, 2)
610     .clampMaxNumElements(0, PrivatePtr, 2)
611     .scalarize(0)
612     .widenScalarToNextPow2(0)
613     .legalIf(all(isPointer(0), typeIs(1, S1)));
614 
615   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
616   // be more flexible with the shift amount type.
617   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
618     .legalFor({{S32, S32}, {S64, S32}});
619   if (ST.has16BitInsts()) {
620     if (ST.hasVOP3PInsts()) {
621       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
622             .clampMaxNumElements(0, S16, 2);
623     } else
624       Shifts.legalFor({{S16, S32}, {S16, S16}});
625 
626     Shifts.clampScalar(1, S16, S32);
627     Shifts.clampScalar(0, S16, S64);
628     Shifts.widenScalarToNextPow2(0, 16);
629   } else {
630     // Make sure we legalize the shift amount type first, as the general
631     // expansion for the shifted type will produce much worse code if it hasn't
632     // been truncated already.
633     Shifts.clampScalar(1, S32, S32);
634     Shifts.clampScalar(0, S32, S64);
635     Shifts.widenScalarToNextPow2(0, 32);
636   }
637   Shifts.scalarize(0);
638 
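  // G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT are custom lowered (see
  // legalizeExtractVectorElt / legalizeInsertVectorElt) for 16-bit or
  // 32-bit-multiple element sizes in vectors of up to 512 bits, with a 32-bit
  // index operand.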
639   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
640     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
641     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
642     unsigned IdxTypeIdx = 2;
643 
644     getActionDefinitionsBuilder(Op)
645       .customIf([=](const LegalityQuery &Query) {
646           const LLT EltTy = Query.Types[EltTypeIdx];
647           const LLT VecTy = Query.Types[VecTypeIdx];
648           const LLT IdxTy = Query.Types[IdxTypeIdx];
649           return (EltTy.getSizeInBits() == 16 ||
650                   EltTy.getSizeInBits() % 32 == 0) &&
651                  VecTy.getSizeInBits() % 32 == 0 &&
652                  VecTy.getSizeInBits() <= 512 &&
653                  IdxTy.getSizeInBits() == 32;
654         })
655       .clampScalar(EltTypeIdx, S32, S64)
656       .clampScalar(VecTypeIdx, S32, S64)
657       .clampScalar(IdxTypeIdx, S32, S32);
658   }
659 
660   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
661     .unsupportedIf([=](const LegalityQuery &Query) {
662         const LLT &EltTy = Query.Types[1].getElementType();
663         return Query.Types[0] != EltTy;
664       });
665 
666   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
667     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
668     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
669 
670     // FIXME: Doesn't handle extract of illegal sizes.
671     getActionDefinitionsBuilder(Op)
672       .legalIf([=](const LegalityQuery &Query) {
673           const LLT BigTy = Query.Types[BigTyIdx];
674           const LLT LitTy = Query.Types[LitTyIdx];
675           return (BigTy.getSizeInBits() % 32 == 0) &&
676                  (LitTy.getSizeInBits() % 16 == 0);
677         })
678       .widenScalarIf(
679         [=](const LegalityQuery &Query) {
680           const LLT BigTy = Query.Types[BigTyIdx];
681           return (BigTy.getScalarSizeInBits() < 16);
682         },
683         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
684       .widenScalarIf(
685         [=](const LegalityQuery &Query) {
686           const LLT LitTy = Query.Types[LitTyIdx];
687           return (LitTy.getScalarSizeInBits() < 16);
688         },
689         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
690       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
691       .widenScalarToNextPow2(BigTyIdx, 32);
692 
693   }
694 
695   getActionDefinitionsBuilder(G_BUILD_VECTOR)
696       .legalForCartesianProduct(AllS32Vectors, {S32})
697       .legalForCartesianProduct(AllS64Vectors, {S64})
698       .clampNumElements(0, V16S32, V16S32)
699       .clampNumElements(0, V2S64, V8S64)
700       .minScalarSameAs(1, 0)
701       .legalIf(isRegisterType(0))
702       .minScalarOrElt(0, S32);
703 
704   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
705     .legalIf(isRegisterType(0));
706 
707   // Merge/Unmerge
708   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
709     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
710     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
711 
712     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
713       const LLT &Ty = Query.Types[TypeIdx];
714       if (Ty.isVector()) {
715         const LLT &EltTy = Ty.getElementType();
716         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
717           return true;
718         if (!isPowerOf2_32(EltTy.getSizeInBits()))
719           return true;
720       }
721       return false;
722     };
723 
724     getActionDefinitionsBuilder(Op)
725       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
726       // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
727       // worth considering the multiples of 64 since 2*192 and 2*384 are not
728       // valid.
729       .clampScalar(LitTyIdx, S16, S256)
730       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
731 
732       // Break up vectors with weird elements into scalars
733       .fewerElementsIf(
734         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
735         scalarize(0))
736       .fewerElementsIf(
737         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
738         scalarize(1))
739       .clampScalar(BigTyIdx, S32, S512)
740       .lowerFor({{S16, V2S16}})
741       .widenScalarIf(
742         [=](const LegalityQuery &Query) {
743           const LLT &Ty = Query.Types[BigTyIdx];
744           return !isPowerOf2_32(Ty.getSizeInBits()) &&
745                  Ty.getSizeInBits() % 16 != 0;
746         },
747         [=](const LegalityQuery &Query) {
748           // Pick the next power of 2, or a multiple of 64 over 128,
749           // whichever is smaller.
750           const LLT &Ty = Query.Types[BigTyIdx];
751           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
752           if (NewSizeInBits >= 256) {
753             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
754             if (RoundedTo < NewSizeInBits)
755               NewSizeInBits = RoundedTo;
756           }
757           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
758         })
759       .legalIf([=](const LegalityQuery &Query) {
760           const LLT &BigTy = Query.Types[BigTyIdx];
761           const LLT &LitTy = Query.Types[LitTyIdx];
762 
763           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
764             return false;
765           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
766             return false;
767 
768           return BigTy.getSizeInBits() % 16 == 0 &&
769                  LitTy.getSizeInBits() % 16 == 0 &&
770                  BigTy.getSizeInBits() <= 512;
771         })
772       // Any vectors left are the wrong size. Scalarize them.
773       .scalarize(0)
774       .scalarize(1);
775   }
776 
777   computeTables();
778   verify(*ST.getInstrInfo());
779 }
780 
781 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
782                                          MachineRegisterInfo &MRI,
783                                          MachineIRBuilder &MIRBuilder,
784                                          GISelChangeObserver &Observer) const {
785   switch (MI.getOpcode()) {
786   case TargetOpcode::G_ADDRSPACE_CAST:
787     return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
788   case TargetOpcode::G_FRINT:
789     return legalizeFrint(MI, MRI, MIRBuilder);
790   case TargetOpcode::G_FCEIL:
791     return legalizeFceil(MI, MRI, MIRBuilder);
792   case TargetOpcode::G_INTRINSIC_TRUNC:
793     return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
794   case TargetOpcode::G_SITOFP:
795     return legalizeITOFP(MI, MRI, MIRBuilder, true);
796   case TargetOpcode::G_UITOFP:
797     return legalizeITOFP(MI, MRI, MIRBuilder, false);
798   case TargetOpcode::G_FMINNUM:
799   case TargetOpcode::G_FMAXNUM:
800   case TargetOpcode::G_FMINNUM_IEEE:
801   case TargetOpcode::G_FMAXNUM_IEEE:
802     return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
803   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
804     return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
805   case TargetOpcode::G_INSERT_VECTOR_ELT:
806     return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
807   default:
808     return false;
809   }
810 
811   llvm_unreachable("expected switch to return");
812 }
813 
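// Return a 32-bit register holding the aperture (the high 32 bits of the
// corresponding flat addresses) for the given segment address space, read from
// the hardware aperture registers when available and otherwise loaded from the
// queue pointer.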
814 Register AMDGPULegalizerInfo::getSegmentAperture(
815   unsigned AS,
816   MachineRegisterInfo &MRI,
817   MachineIRBuilder &MIRBuilder) const {
818   MachineFunction &MF = MIRBuilder.getMF();
819   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
820   const LLT S32 = LLT::scalar(32);
821 
822   if (ST.hasApertureRegs()) {
823     // FIXME: Use inline constants (src_{shared, private}_base) instead of
824     // getreg.
825     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
826         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
827         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
828     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
829         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
830         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
831     unsigned Encoding =
832         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
833         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
834         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
835 
836     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
837     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
838 
839     MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
840       .addDef(GetReg)
841       .addImm(Encoding);
842     MRI.setType(GetReg, S32);
843 
844     auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
845     MIRBuilder.buildInstr(TargetOpcode::G_SHL)
846       .addDef(ApertureReg)
847       .addUse(GetReg)
848       .addUse(ShiftAmt.getReg(0));
849 
850     return ApertureReg;
851   }
852 
853   Register QueuePtr = MRI.createGenericVirtualRegister(
854     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
855 
856   // FIXME: Placeholder until we can track the input registers.
857   MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
858 
859   // Offset into amd_queue_t for group_segment_aperture_base_hi /
860   // private_segment_aperture_base_hi.
861   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
862 
863   // FIXME: Don't use undef
864   Value *V = UndefValue::get(PointerType::get(
865                                Type::getInt8Ty(MF.getFunction().getContext()),
866                                AMDGPUAS::CONSTANT_ADDRESS));
867 
868   MachinePointerInfo PtrInfo(V, StructOffset);
869   MachineMemOperand *MMO = MF.getMachineMemOperand(
870     PtrInfo,
871     MachineMemOperand::MOLoad |
872     MachineMemOperand::MODereferenceable |
873     MachineMemOperand::MOInvariant,
874     4,
875     MinAlign(64, StructOffset));
876 
877   Register LoadResult = MRI.createGenericVirtualRegister(S32);
878   Register LoadAddr;
879 
880   MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
881   MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
882   return LoadResult;
883 }
884 
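// Lower G_ADDRSPACE_CAST. No-op casts become G_BITCAST; flat -> local/private
// takes the low 32 bits of the pointer, and local/private -> flat rebuilds a
// 64-bit pointer from the 32-bit offset and the segment aperture, remapping
// null pointers to the destination address space's null value.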
885 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
886   MachineInstr &MI, MachineRegisterInfo &MRI,
887   MachineIRBuilder &MIRBuilder) const {
888   MachineFunction &MF = MIRBuilder.getMF();
889 
890   MIRBuilder.setInstr(MI);
891 
892   Register Dst = MI.getOperand(0).getReg();
893   Register Src = MI.getOperand(1).getReg();
894 
895   LLT DstTy = MRI.getType(Dst);
896   LLT SrcTy = MRI.getType(Src);
897   unsigned DestAS = DstTy.getAddressSpace();
898   unsigned SrcAS = SrcTy.getAddressSpace();
899 
900   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
901   // vector element.
902   assert(!DstTy.isVector());
903 
904   const AMDGPUTargetMachine &TM
905     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
906 
907   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
908   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
909     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
910     return true;
911   }
912 
913   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
914     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
915            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
916     unsigned NullVal = TM.getNullPointerValue(DestAS);
917 
918     auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
919     auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
920 
921     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
922 
923     // Extract low 32-bits of the pointer.
924     MIRBuilder.buildExtract(PtrLo32, Src, 0);
925 
926     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
927     MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
928     MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
929 
930     MI.eraseFromParent();
931     return true;
932   }
933 
934   assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
935          SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
936 
937   auto SegmentNull =
938       MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
939   auto FlatNull =
940       MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
941 
942   Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
943 
944   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
945   MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
946 
947   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
948 
949   // Coerce the type of the low half of the result so we can use merge_values.
950   Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
951   MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
952     .addDef(SrcAsInt)
953     .addUse(Src);
954 
955   // TODO: Should we allow mismatched types but matching sizes in merges to
956   // avoid the ptrtoint?
957   MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
958   MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
959 
960   MI.eraseFromParent();
961   return true;
962 }
963 
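// Lower f64 G_FRINT with the 2^52 trick: adding and then subtracting
// copysign(2^52, x) rounds x to an integer in the current rounding mode;
// inputs with |x| >= 2^52 are already integral and pass through unchanged.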
964 bool AMDGPULegalizerInfo::legalizeFrint(
965   MachineInstr &MI, MachineRegisterInfo &MRI,
966   MachineIRBuilder &MIRBuilder) const {
967   MIRBuilder.setInstr(MI);
968 
969   Register Src = MI.getOperand(1).getReg();
970   LLT Ty = MRI.getType(Src);
971   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
972 
973   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
974   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
975 
976   auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
977   auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
978 
979   // TODO: Should this propagate fast-math-flags?
980   auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
981   auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
982 
983   auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
984   auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
985 
986   auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
987   MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
988   return true;
989 }
990 
991 bool AMDGPULegalizerInfo::legalizeFceil(
992   MachineInstr &MI, MachineRegisterInfo &MRI,
993   MachineIRBuilder &B) const {
994   B.setInstr(MI);
995 
996   const LLT S1 = LLT::scalar(1);
997   const LLT S64 = LLT::scalar(64);
998 
999   Register Src = MI.getOperand(1).getReg();
1000   assert(MRI.getType(Src) == S64);
1001 
1002   // result = trunc(src)
1003   // if (src > 0.0 && src != result)
1004   //   result += 1.0
1005 
1006   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1007 
1008   const auto Zero = B.buildFConstant(S64, 0.0);
1009   const auto One = B.buildFConstant(S64, 1.0);
1010   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1011   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1012   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1013   auto Add = B.buildSelect(S64, And, One, Zero);
1014 
1015   // TODO: Should this propagate fast-math-flags?
1016   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1017   return true;
1018 }
1019 
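// Extract the exponent field from the high 32 bits of an f64 value with
// amdgcn.ubfe and subtract the bias (1023) to get the unbiased exponent.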
1020 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1021                                               MachineIRBuilder &B) {
1022   const unsigned FractBits = 52;
1023   const unsigned ExpBits = 11;
1024   LLT S32 = LLT::scalar(32);
1025 
1026   auto Const0 = B.buildConstant(S32, FractBits - 32);
1027   auto Const1 = B.buildConstant(S32, ExpBits);
1028 
1029   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
1030     .addUse(Const0.getReg(0))
1031     .addUse(Const1.getReg(0));
1032 
1033   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1034 }
1035 
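// Lower f64 G_INTRINSIC_TRUNC by clearing the fraction bits that lie below
// the binary point: exponents below zero produce a signed zero, and exponents
// above 51 mean the value is already an integer and is returned unchanged.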
1036 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1037   MachineInstr &MI, MachineRegisterInfo &MRI,
1038   MachineIRBuilder &B) const {
1039   B.setInstr(MI);
1040 
1041   const LLT S1 = LLT::scalar(1);
1042   const LLT S32 = LLT::scalar(32);
1043   const LLT S64 = LLT::scalar(64);
1044 
1045   Register Src = MI.getOperand(1).getReg();
1046   assert(MRI.getType(Src) == S64);
1047 
1048   // TODO: Should this use extract since the low half is unused?
1049   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1050   Register Hi = Unmerge.getReg(1);
1051 
1052   // Extract the upper half, since this is where we will find the sign and
1053   // exponent.
1054   auto Exp = extractF64Exponent(Hi, B);
1055 
1056   const unsigned FractBits = 52;
1057 
1058   // Extract the sign bit.
1059   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1060   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1061 
1062   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1063 
1064   const auto Zero32 = B.buildConstant(S32, 0);
1065 
1066   // Extend back to 64-bits.
1067   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1068 
1069   auto Shr = B.buildAShr(S64, FractMask, Exp);
1070   auto Not = B.buildNot(S64, Shr);
1071   auto Tmp0 = B.buildAnd(S64, Src, Not);
1072   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1073 
1074   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1075   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1076 
1077   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1078   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1079   return true;
1080 }
1081 
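// Lower s64 -> f64 G_SITOFP / G_UITOFP by converting the two 32-bit halves
// separately: result = ldexp(fp(hi), 32) + uitofp(lo).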
1082 bool AMDGPULegalizerInfo::legalizeITOFP(
1083   MachineInstr &MI, MachineRegisterInfo &MRI,
1084   MachineIRBuilder &B, bool Signed) const {
1085   B.setInstr(MI);
1086 
1087   Register Dst = MI.getOperand(0).getReg();
1088   Register Src = MI.getOperand(1).getReg();
1089 
1090   const LLT S64 = LLT::scalar(64);
1091   const LLT S32 = LLT::scalar(32);
1092 
1093   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1094 
1095   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1096 
1097   auto CvtHi = Signed ?
1098     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1099     B.buildUITOFP(S64, Unmerge.getReg(1));
1100 
1101   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1102 
1103   auto ThirtyTwo = B.buildConstant(S32, 32);
1104   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1105     .addUse(CvtHi.getReg(0))
1106     .addUse(ThirtyTwo.getReg(0));
1107 
1108   // TODO: Should this propagate fast-math-flags?
1109   B.buildFAdd(Dst, LdExp, CvtLo);
1110   MI.eraseFromParent();
1111   return true;
1112 }
1113 
1114 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1115   MachineInstr &MI, MachineRegisterInfo &MRI,
1116   MachineIRBuilder &B) const {
1117   MachineFunction &MF = B.getMF();
1118   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1119 
1120   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1121                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1122 
1123   // With ieee_mode disabled, the instructions have the correct behavior
1124   // already for G_FMINNUM/G_FMAXNUM
1125   if (!MFI->getMode().IEEE)
1126     return !IsIEEEOp;
1127 
1128   if (IsIEEEOp)
1129     return true;
1130 
1131   MachineIRBuilder HelperBuilder(MI);
1132   GISelObserverWrapper DummyObserver;
1133   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1134   HelperBuilder.setMBB(*MI.getParent());
1135   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1136 }
1137 
1138 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1139   MachineInstr &MI, MachineRegisterInfo &MRI,
1140   MachineIRBuilder &B) const {
1141   // TODO: Should move some of this into LegalizerHelper.
1142 
1143   // TODO: Promote dynamic indexing of s16 to s32
1144   // TODO: Dynamic s64 indexing is only legal for SGPR.
1145   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1146   if (!IdxVal) // Dynamic case will be selected to register indexing.
1147     return true;
1148 
1149   Register Dst = MI.getOperand(0).getReg();
1150   Register Vec = MI.getOperand(1).getReg();
1151 
1152   LLT VecTy = MRI.getType(Vec);
1153   LLT EltTy = VecTy.getElementType();
1154   assert(EltTy == MRI.getType(Dst));
1155 
1156   B.setInstr(MI);
1157 
1158   if (IdxVal.getValue() < VecTy.getNumElements())
1159     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1160   else
1161     B.buildUndef(Dst);
1162 
1163   MI.eraseFromParent();
1164   return true;
1165 }
1166 
1167 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1168   MachineInstr &MI, MachineRegisterInfo &MRI,
1169   MachineIRBuilder &B) const {
1170   // TODO: Should move some of this into LegalizerHelper.
1171 
1172   // TODO: Promote dynamic indexing of s16 to s32
1173   // TODO: Dynamic s64 indexing is only legal for SGPR.
1174   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1175   if (!IdxVal) // Dynamic case will be selected to register indexing.
1176     return true;
1177 
1178   Register Dst = MI.getOperand(0).getReg();
1179   Register Vec = MI.getOperand(1).getReg();
1180   Register Ins = MI.getOperand(2).getReg();
1181 
1182   LLT VecTy = MRI.getType(Vec);
1183   LLT EltTy = VecTy.getElementType();
1184   assert(EltTy == MRI.getType(Ins));
1185 
1186   B.setInstr(MI);
1187 
1188   if (IdxVal.getValue() < VecTy.getNumElements())
1189     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1190   else
1191     B.buildUndef(Dst);
1192 
1193   MI.eraseFromParent();
1194   return true;
1195 }
1196 
1197 // Return the use branch instruction, otherwise null if the usage is invalid.
1198 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1199                                        MachineRegisterInfo &MRI) {
1200   Register CondDef = MI.getOperand(0).getReg();
1201   if (!MRI.hasOneNonDBGUse(CondDef))
1202     return nullptr;
1203 
1204   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1205   return UseMI.getParent() == MI.getParent() &&
1206     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1207 }
1208 
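// Return the virtual register bound to the physical live-in register Reg,
// creating a new one of type Ty if none exists yet.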
1209 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1210                                                 Register Reg, LLT Ty) const {
1211   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1212   if (LiveIn)
1213     return LiveIn;
1214 
1215   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1216   MRI.addLiveIn(Reg, NewReg);
1217   return NewReg;
1218 }
1219 
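// Copy a preloaded argument register into DstReg, shifting and masking packed
// arguments as needed, and emit the live-in copy in the entry block if it has
// not been created yet.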
1220 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1221                                          const ArgDescriptor *Arg) const {
1222   if (!Arg->isRegister())
1223     return false; // TODO: Handle these
1224 
1225   assert(Arg->getRegister() != 0);
1226   assert(Arg->getRegister().isPhysical());
1227 
1228   MachineRegisterInfo &MRI = *B.getMRI();
1229 
1230   LLT Ty = MRI.getType(DstReg);
1231   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1232 
1233   if (Arg->isMasked()) {
1234     // TODO: Should we try to emit this once in the entry block?
1235     const LLT S32 = LLT::scalar(32);
1236     const unsigned Mask = Arg->getMask();
1237     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1238 
1239     auto ShiftAmt = B.buildConstant(S32, Shift);
1240     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1241     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1242   } else
1243     B.buildCopy(DstReg, LiveIn);
1244 
1245   // Insert the argument copy if it doesn't already exist.
1246   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1247   if (!MRI.getVRegDef(LiveIn)) {
1248     MachineBasicBlock &EntryMBB = B.getMF().front();
1249     EntryMBB.addLiveIn(Arg->getRegister());
1250     B.setInsertPt(EntryMBB, EntryMBB.begin());
1251     B.buildCopy(LiveIn, Arg->getRegister());
1252   }
1253 
1254   return true;
1255 }
1256 
1257 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1258   MachineInstr &MI,
1259   MachineRegisterInfo &MRI,
1260   MachineIRBuilder &B,
1261   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1262   B.setInstr(MI);
1263 
1264   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1265 
1266   const ArgDescriptor *Arg;
1267   const TargetRegisterClass *RC;
1268   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1269   if (!Arg) {
1270     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1271     return false;
1272   }
1273 
1274   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1275     MI.eraseFromParent();
1276     return true;
1277   }
1278 
1279   return false;
1280 }
1281 
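// Expand amdgcn.fdiv.fast using the reciprocal: when |rhs| exceeds 2^+96 the
// denominator is scaled by 2^-32 before taking the reciprocal, and the
// quotient is multiplied by the same 2^-32 to compensate, keeping the
// intermediates in f32 range.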
1282 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1283                                            MachineRegisterInfo &MRI,
1284                                            MachineIRBuilder &B) const {
1285   B.setInstr(MI);
1286   Register Res = MI.getOperand(0).getReg();
1287   Register LHS = MI.getOperand(2).getReg();
1288   Register RHS = MI.getOperand(3).getReg();
1289   uint16_t Flags = MI.getFlags();
1290 
1291   LLT S32 = LLT::scalar(32);
1292   LLT S1 = LLT::scalar(1);
1293 
1294   auto Abs = B.buildFAbs(S32, RHS, Flags);
1295   const APFloat C0Val(1.0f);
1296 
1297   auto C0 = B.buildConstant(S32, 0x6f800000);
1298   auto C1 = B.buildConstant(S32, 0x2f800000);
1299   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1300 
1301   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1302   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1303 
1304   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1305 
1306   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1307     .addUse(Mul0.getReg(0))
1308     .setMIFlags(Flags);
1309 
1310   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1311 
1312   B.buildFMul(Res, Sel, Mul1, Flags);
1313 
1314   MI.eraseFromParent();
1315   return true;
1316 }
1317 
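// In kernels the implicit argument pointer is the kernarg segment pointer
// offset by the size of the explicit kernel arguments; in callable functions
// it is a preloaded argument like the other argument intrinsics.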
1318 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1319                                                  MachineRegisterInfo &MRI,
1320                                                  MachineIRBuilder &B) const {
1321   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1322   if (!MFI->isEntryFunction()) {
1323     return legalizePreloadedArgIntrin(MI, MRI, B,
1324                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1325   }
1326 
1327   B.setInstr(MI);
1328 
1329   uint64_t Offset =
1330     ST.getTargetLowering()->getImplicitParameterOffset(
1331       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1332   Register DstReg = MI.getOperand(0).getReg();
1333   LLT DstTy = MRI.getType(DstReg);
1334   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1335 
1336   const ArgDescriptor *Arg;
1337   const TargetRegisterClass *RC;
1338   std::tie(Arg, RC)
1339     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1340   if (!Arg)
1341     return false;
1342 
1343   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1344   if (!loadInputValue(KernargPtrReg, B, Arg))
1345     return false;
1346 
1347   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1348   MI.eraseFromParent();
1349   return true;
1350 }
1351 
1352 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1353                                             MachineRegisterInfo &MRI,
1354                                             MachineIRBuilder &B) const {
1355   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
1356   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1357   case Intrinsic::amdgcn_if: {
1358     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1359       const SIRegisterInfo *TRI
1360         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1361 
1362       B.setInstr(*BrCond);
1363       Register Def = MI.getOperand(1).getReg();
1364       Register Use = MI.getOperand(3).getReg();
1365       B.buildInstr(AMDGPU::SI_IF)
1366         .addDef(Def)
1367         .addUse(Use)
1368         .addMBB(BrCond->getOperand(1).getMBB());
1369 
1370       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1371       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1372       MI.eraseFromParent();
1373       BrCond->eraseFromParent();
1374       return true;
1375     }
1376 
1377     return false;
1378   }
1379   case Intrinsic::amdgcn_loop: {
1380     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1381       const SIRegisterInfo *TRI
1382         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1383 
1384       B.setInstr(*BrCond);
1385       Register Reg = MI.getOperand(2).getReg();
1386       B.buildInstr(AMDGPU::SI_LOOP)
1387         .addUse(Reg)
1388         .addMBB(BrCond->getOperand(1).getMBB());
1389       MI.eraseFromParent();
1390       BrCond->eraseFromParent();
1391       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1392       return true;
1393     }
1394 
1395     return false;
1396   }
1397   case Intrinsic::amdgcn_kernarg_segment_ptr:
1398     return legalizePreloadedArgIntrin(
1399       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1400   case Intrinsic::amdgcn_implicitarg_ptr:
1401     return legalizeImplicitArgPtr(MI, MRI, B);
1402   case Intrinsic::amdgcn_workitem_id_x:
1403     return legalizePreloadedArgIntrin(MI, MRI, B,
1404                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1405   case Intrinsic::amdgcn_workitem_id_y:
1406     return legalizePreloadedArgIntrin(MI, MRI, B,
1407                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1408   case Intrinsic::amdgcn_workitem_id_z:
1409     return legalizePreloadedArgIntrin(MI, MRI, B,
1410                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1411   case Intrinsic::amdgcn_workgroup_id_x:
1412     return legalizePreloadedArgIntrin(MI, MRI, B,
1413                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1414   case Intrinsic::amdgcn_workgroup_id_y:
1415     return legalizePreloadedArgIntrin(MI, MRI, B,
1416                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1417   case Intrinsic::amdgcn_workgroup_id_z:
1418     return legalizePreloadedArgIntrin(MI, MRI, B,
1419                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1420   case Intrinsic::amdgcn_dispatch_ptr:
1421     return legalizePreloadedArgIntrin(MI, MRI, B,
1422                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1423   case Intrinsic::amdgcn_queue_ptr:
1424     return legalizePreloadedArgIntrin(MI, MRI, B,
1425                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1426   case Intrinsic::amdgcn_implicit_buffer_ptr:
1427     return legalizePreloadedArgIntrin(
1428       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1429   case Intrinsic::amdgcn_dispatch_id:
1430     return legalizePreloadedArgIntrin(MI, MRI, B,
1431                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1432   case Intrinsic::amdgcn_fdiv_fast:
1433     return legalizeFDIVFast(MI, MRI, B);
1434   default:
1435     return true;
1436   }
1437 
1438   return true;
1439 }
1440