//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

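// True if the type at TypeIdx is at most MaxSize bits wide and its scalar
// element size is a multiple of 32.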
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

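// True for vectors with an odd element count whose elements are narrower than
// 32 bits; these are padded with one extra element below.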
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

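// Mutation that grows the vector at TypeIdx by a single element.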
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

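// Mutation that reduces the element count at TypeIdx so the result splits
// into roughly 64-bit pieces.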
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

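  // Helpers comparing the sizes of two type indices; used below to widen or
  // narrow the integer half of G_INTTOPTR/G_PTRTOINT to the pointer size.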
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
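    // Narrow extending loads with a result wider than 32 bits down to a
    // 32-bit result; the extension itself is legalized separately.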
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalForCartesianProduct(AllS32Vectors, {S32})
      .legalForCartesianProduct(AllS64Vectors, {S64})
      .clampNumElements(0, V16S32, V16S32)
      .clampNumElements(0, V2S64, V8S64)
      .minScalarSameAs(1, 0)
      .legalIf(isRegisterType(0))
      .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128, whichever
          // is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
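    // Pack the hwreg id, offset and width fields into the immediate expected
    // by S_GETREG_B32.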
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
                               Type::getInt8Ty(MF.getFunction().getContext()),
                               AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

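    // A null flat pointer must map to the segment null value, so compare the
    // source against flat null and select between the truncated pointer and
    // the segment null constant.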
    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

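  // Round to nearest even by adding and subtracting 2^52 with the sign of the
  // source: at that magnitude every representable double is an integer, so
  // the fadd/fsub pair leaves the rounded value. Inputs whose magnitude
  // already reaches 2^52 (i.e. exceeds 0x1.fffffffffffffp+51) are returned
  // unchanged.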
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

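// Extract the unbiased exponent from the high 32 bits of an f64 value using
// the amdgcn.ubfe intrinsic, then subtract the f64 exponent bias (1023).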
static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

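  // Truncate toward zero by clearing the fraction bits that sit below the
  // binary point: shift the 52-bit fraction mask right by the unbiased
  // exponent and mask those bits off. An exponent below zero yields a signed
  // zero, and an exponent above 51 means the value is already integral.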
  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

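  // Convert the 64-bit integer in two halves: convert the high half (signed
  // or unsigned as requested), scale it by 2^32 with amdgcn.ldexp, and add
  // the unsigned conversion of the low half.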
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
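    // A masked argument shares its register with other packed values (e.g.
    // workitem IDs packed into a single register), so extract the field with
    // a shift and mask.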
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

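  // Fast 32-bit division built around v_rcp_f32: if |rhs| exceeds 2^96
  // (0x6f800000), pre-scale it by 2^-32 (0x2f800000) so the reciprocal stays
  // in range, then apply the same scale factor to the final product.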
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  default:
    return true;
  }

  return true;
}