1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
25 
26 #define DEBUG_TYPE "amdgpu-legalinfo"
27 
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
32 
33 
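// Matches types whose total size is at most MaxSize bits and whose scalar
// (element) size is a multiple of 32 bits.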
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35                                       unsigned MaxSize = 512) {
36   return [=](const LegalityQuery &Query) {
37     const LLT Ty = Query.Types[TypeIdx];
38     const LLT EltTy = Ty.getScalarType();
39     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
40   };
41 }
42 
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     return Ty.isVector() &&
47            Ty.getNumElements() % 2 != 0 &&
48            Ty.getElementType().getSizeInBits() < 32;
49   };
50 }
51 
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53   return [=](const LegalityQuery &Query) {
54     const LLT Ty = Query.Types[TypeIdx];
55     const LLT EltTy = Ty.getElementType();
56     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
57   };
58 }
59 
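// Mutation that splits a wide vector into roughly 64-bit pieces by reducing
// the element count, e.g. v4s32 becomes v2s32 (two 64-bit pieces).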
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     unsigned Size = Ty.getSizeInBits();
65     unsigned Pieces = (Size + 63) / 64;
66     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
68   };
69 }
70 
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72   return [=](const LegalityQuery &Query) {
73     const LLT QueryTy = Query.Types[TypeIdx];
74     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
75   };
76 }
77 
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT QueryTy = Query.Types[TypeIdx];
81     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
82   };
83 }
84 
// Any vector whose elements are 32, 64, 128 or 256 bits, any vector with an
// even number of 16-bit elements (i.e. multiples of v2s16), and any scalar
// that is a multiple of 32 bits up to 512 bits.
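// For example s96, v2s32, and v4s16 are register types, while s24 and v3s16
// are not.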
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     if (Ty.isVector()) {
91       const int EltSize = Ty.getElementType().getSizeInBits();
92       return EltSize == 32 || EltSize == 64 ||
93             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94              EltSize == 128 || EltSize == 256;
95     }
96 
97     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
98   };
99 }
100 
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102                                          const GCNTargetMachine &TM)
  : ST(ST_) {
104   using namespace TargetOpcode;
105 
106   auto GetAddrSpacePtr = [&TM](unsigned AS) {
107     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
108   };
109 
110   const LLT S1 = LLT::scalar(1);
111   const LLT S8 = LLT::scalar(8);
112   const LLT S16 = LLT::scalar(16);
113   const LLT S32 = LLT::scalar(32);
114   const LLT S64 = LLT::scalar(64);
115   const LLT S128 = LLT::scalar(128);
116   const LLT S256 = LLT::scalar(256);
117   const LLT S512 = LLT::scalar(512);
118 
119   const LLT V2S16 = LLT::vector(2, 16);
120   const LLT V4S16 = LLT::vector(4, 16);
121 
122   const LLT V2S32 = LLT::vector(2, 32);
123   const LLT V3S32 = LLT::vector(3, 32);
124   const LLT V4S32 = LLT::vector(4, 32);
125   const LLT V5S32 = LLT::vector(5, 32);
126   const LLT V6S32 = LLT::vector(6, 32);
127   const LLT V7S32 = LLT::vector(7, 32);
128   const LLT V8S32 = LLT::vector(8, 32);
129   const LLT V9S32 = LLT::vector(9, 32);
130   const LLT V10S32 = LLT::vector(10, 32);
131   const LLT V11S32 = LLT::vector(11, 32);
132   const LLT V12S32 = LLT::vector(12, 32);
133   const LLT V13S32 = LLT::vector(13, 32);
134   const LLT V14S32 = LLT::vector(14, 32);
135   const LLT V15S32 = LLT::vector(15, 32);
136   const LLT V16S32 = LLT::vector(16, 32);
137 
138   const LLT V2S64 = LLT::vector(2, 64);
139   const LLT V3S64 = LLT::vector(3, 64);
140   const LLT V4S64 = LLT::vector(4, 64);
141   const LLT V5S64 = LLT::vector(5, 64);
142   const LLT V6S64 = LLT::vector(6, 64);
143   const LLT V7S64 = LLT::vector(7, 64);
144   const LLT V8S64 = LLT::vector(8, 64);
145 
146   std::initializer_list<LLT> AllS32Vectors =
147     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149   std::initializer_list<LLT> AllS64Vectors =
150     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
151 
152   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
155   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
156   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
157   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
158   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
159 
160   const LLT CodePtr = FlatPtr;
161 
162   const std::initializer_list<LLT> AddrSpaces64 = {
163     GlobalPtr, ConstantPtr, FlatPtr
164   };
165 
166   const std::initializer_list<LLT> AddrSpaces32 = {
167     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
168   };
169 
170   const std::initializer_list<LLT> FPTypesBase = {
171     S32, S64
172   };
173 
174   const std::initializer_list<LLT> FPTypes16 = {
175     S32, S64, S16
176   };
177 
178   const std::initializer_list<LLT> FPTypesPK16 = {
179     S32, S64, S16, V2S16
180   };
181 
182   setAction({G_BRCOND, S1}, Legal);
183 
184   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
185   // elements for v3s16
186   getActionDefinitionsBuilder(G_PHI)
187     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
188     .legalFor(AllS32Vectors)
189     .legalFor(AllS64Vectors)
190     .legalFor(AddrSpaces64)
191     .legalFor(AddrSpaces32)
192     .clampScalar(0, S32, S256)
193     .widenScalarToNextPow2(0, 32)
194     .clampMaxNumElements(0, S32, 16)
195     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
196     .legalIf(isPointer(0));
197 
198   if (ST.has16BitInsts()) {
199     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
200       .legalFor({S32, S16})
201       .clampScalar(0, S16, S32)
202       .scalarize(0);
203   } else {
204     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
205       .legalFor({S32})
206       .clampScalar(0, S32, S32)
207       .scalarize(0);
208   }
209 
210   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
211     .legalFor({S32})
212     .clampScalar(0, S32, S32)
213     .scalarize(0);
214 
215   // Report legal for any types we can handle anywhere. For the cases only legal
216   // on the SALU, RegBankSelect will be able to re-legalize.
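  // For example, 64-bit bitwise operations only have SALU encodings
  // (S_AND_B64 etc.); the VALU only provides 32-bit forms.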
217   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
218     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
219     .clampScalar(0, S32, S64)
220     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
221     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
222     .widenScalarToNextPow2(0)
223     .scalarize(0);
224 
225   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
226                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
227     .legalFor({{S32, S1}})
228     .clampScalar(0, S32, S32);
229 
230   getActionDefinitionsBuilder(G_BITCAST)
231     .legalForCartesianProduct({S32, V2S16})
232     .legalForCartesianProduct({S64, V2S32, V4S16})
233     .legalForCartesianProduct({V2S64, V4S32})
234     // Don't worry about the size constraint.
235     .legalIf(all(isPointer(0), isPointer(1)));
236 
237   if (ST.has16BitInsts()) {
238     getActionDefinitionsBuilder(G_FCONSTANT)
239       .legalFor({S32, S64, S16})
240       .clampScalar(0, S16, S64);
241   } else {
242     getActionDefinitionsBuilder(G_FCONSTANT)
243       .legalFor({S32, S64})
244       .clampScalar(0, S32, S64);
245   }
246 
247   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
248     .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
249                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
250     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
251     .clampScalarOrElt(0, S32, S512)
252     .legalIf(isMultiple32(0))
253     .widenScalarToNextPow2(0, 32)
254     .clampMaxNumElements(0, S32, 16);
255 
256 
257   // FIXME: i1 operands to intrinsics should always be legal, but other i1
258   // values may not be legal.  We need to figure out how to distinguish
259   // between these two scenarios.
260   getActionDefinitionsBuilder(G_CONSTANT)
261     .legalFor({S1, S32, S64, GlobalPtr,
262                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
263     .clampScalar(0, S32, S64)
264     .widenScalarToNextPow2(0)
265     .legalIf(isPointer(0));
266 
267   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
268 
269   auto &FPOpActions = getActionDefinitionsBuilder(
270     { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
271     .legalFor({S32, S64});
272 
273   if (ST.has16BitInsts()) {
274     if (ST.hasVOP3PInsts())
275       FPOpActions.legalFor({S16, V2S16});
276     else
277       FPOpActions.legalFor({S16});
278   }
279 
280   auto &MinNumMaxNum = getActionDefinitionsBuilder({
281       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
282 
283   if (ST.hasVOP3PInsts()) {
284     MinNumMaxNum.customFor(FPTypesPK16)
285       .clampMaxNumElements(0, S16, 2)
286       .clampScalar(0, S16, S64)
287       .scalarize(0);
288   } else if (ST.has16BitInsts()) {
289     MinNumMaxNum.customFor(FPTypes16)
290       .clampScalar(0, S16, S64)
291       .scalarize(0);
292   } else {
293     MinNumMaxNum.customFor(FPTypesBase)
294       .clampScalar(0, S32, S64)
295       .scalarize(0);
296   }
297 
298   // TODO: Implement
299   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
300 
301   if (ST.hasVOP3PInsts())
302     FPOpActions.clampMaxNumElements(0, S16, 2);
303   FPOpActions
304     .scalarize(0)
305     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
306 
307   if (ST.has16BitInsts()) {
308     getActionDefinitionsBuilder(G_FSQRT)
309       .legalFor({S32, S64, S16})
310       .scalarize(0)
311       .clampScalar(0, S16, S64);
312   } else {
313     getActionDefinitionsBuilder(G_FSQRT)
314       .legalFor({S32, S64})
315       .scalarize(0)
316       .clampScalar(0, S32, S64);
317   }
318 
319   getActionDefinitionsBuilder(G_FPTRUNC)
320     .legalFor({{S32, S64}, {S16, S32}})
321     .scalarize(0);
322 
323   getActionDefinitionsBuilder(G_FPEXT)
324     .legalFor({{S64, S32}, {S32, S16}})
325     .lowerFor({{S64, S16}}) // FIXME: Implement
326     .scalarize(0);
327 
328   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
329   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
330 
331   getActionDefinitionsBuilder(G_FSUB)
332       // Use actual fsub instruction
333       .legalFor({S32})
334       // Must use fadd + fneg
335       .lowerFor({S64, S16, V2S16})
336       .scalarize(0)
337       .clampScalar(0, S32, S64);
338 
339   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
340     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
341                {S32, S1}, {S64, S1}, {S16, S1},
342                // FIXME: Hack
343                {S64, LLT::scalar(33)},
344                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
345     .scalarize(0);
346 
347   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
348     .legalFor({{S32, S32}, {S64, S32}})
349     .lowerFor({{S32, S64}})
350     .customFor({{S64, S64}})
351     .scalarize(0);
352 
353   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
354     .legalFor({{S32, S32}, {S32, S64}})
355     .scalarize(0);
356 
357   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
358     .legalFor({S32, S64})
359     .scalarize(0);
360 
361   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
362     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
363       .legalFor({S32, S64})
364       .clampScalar(0, S32, S64)
365       .scalarize(0);
366   } else {
367     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
368       .legalFor({S32})
369       .customFor({S64})
370       .clampScalar(0, S32, S64)
371       .scalarize(0);
372   }
373 
374   getActionDefinitionsBuilder(G_GEP)
375     .legalForCartesianProduct(AddrSpaces64, {S64})
376     .legalForCartesianProduct(AddrSpaces32, {S32})
377     .scalarize(0);
378 
379   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
380 
381   auto &CmpBuilder =
382     getActionDefinitionsBuilder(G_ICMP)
383     .legalForCartesianProduct(
384       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
385     .legalFor({{S1, S32}, {S1, S64}});
386   if (ST.has16BitInsts()) {
387     CmpBuilder.legalFor({{S1, S16}});
388   }
389 
390   CmpBuilder
391     .widenScalarToNextPow2(1)
392     .clampScalar(1, S32, S64)
393     .scalarize(0)
394     .legalIf(all(typeIs(0, S1), isPointer(1)));
395 
396   getActionDefinitionsBuilder(G_FCMP)
397     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
398     .widenScalarToNextPow2(1)
399     .clampScalar(1, S32, S64)
400     .scalarize(0);
401 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
403   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
404                                G_FLOG, G_FLOG2, G_FLOG10})
405     .legalFor({S32})
406     .scalarize(0);
407 
408   // The 64-bit versions produce 32-bit results, but only on the SALU.
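  // (e.g. S_FLBIT_I32_B64 for ctlz and S_BCNT1_I32_B64 for ctpop).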
409   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
410                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
411                                G_CTPOP})
412     .legalFor({{S32, S32}, {S32, S64}})
413     .clampScalar(0, S32, S32)
414     .clampScalar(1, S32, S64)
415     .scalarize(0)
416     .widenScalarToNextPow2(0, 32)
417     .widenScalarToNextPow2(1, 32);
418 
419   // TODO: Expand for > s32
420   getActionDefinitionsBuilder(G_BSWAP)
421     .legalFor({S32})
422     .clampScalar(0, S32, S32)
423     .scalarize(0);
424 
425   if (ST.has16BitInsts()) {
426     if (ST.hasVOP3PInsts()) {
427       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
428         .legalFor({S32, S16, V2S16})
429         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
430         .clampMaxNumElements(0, S16, 2)
431         .clampScalar(0, S16, S32)
432         .widenScalarToNextPow2(0)
433         .scalarize(0);
434     } else {
435       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
436         .legalFor({S32, S16})
437         .widenScalarToNextPow2(0)
438         .clampScalar(0, S16, S32)
439         .scalarize(0);
440     }
441   } else {
442     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
443       .legalFor({S32})
444       .clampScalar(0, S32, S32)
445       .widenScalarToNextPow2(0)
446       .scalarize(0);
447   }
448 
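  // Size comparisons between two type indices, used below to widen or narrow
  // the integer operand of G_INTTOPTR / G_PTRTOINT to the pointer width.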
449   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
450     return [=](const LegalityQuery &Query) {
451       return Query.Types[TypeIdx0].getSizeInBits() <
452              Query.Types[TypeIdx1].getSizeInBits();
453     };
454   };
455 
456   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
457     return [=](const LegalityQuery &Query) {
458       return Query.Types[TypeIdx0].getSizeInBits() >
459              Query.Types[TypeIdx1].getSizeInBits();
460     };
461   };
462 
463   getActionDefinitionsBuilder(G_INTTOPTR)
464     // List the common cases
465     .legalForCartesianProduct(AddrSpaces64, {S64})
466     .legalForCartesianProduct(AddrSpaces32, {S32})
467     .scalarize(0)
468     // Accept any address space as long as the size matches
469     .legalIf(sameSize(0, 1))
470     .widenScalarIf(smallerThan(1, 0),
471       [](const LegalityQuery &Query) {
472         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
473       })
474     .narrowScalarIf(greaterThan(1, 0),
475       [](const LegalityQuery &Query) {
476         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
477       });
478 
479   getActionDefinitionsBuilder(G_PTRTOINT)
480     // List the common cases
481     .legalForCartesianProduct(AddrSpaces64, {S64})
482     .legalForCartesianProduct(AddrSpaces32, {S32})
483     .scalarize(0)
484     // Accept any address space as long as the size matches
485     .legalIf(sameSize(0, 1))
486     .widenScalarIf(smallerThan(0, 1),
487       [](const LegalityQuery &Query) {
488         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
489       })
490     .narrowScalarIf(
491       greaterThan(0, 1),
492       [](const LegalityQuery &Query) {
493         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
494       });
495 
496   if (ST.hasFlatAddressSpace()) {
497     getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
498       .scalarize(0)
499       .custom();
500   }
501 
502   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
503   // handle some operations by just promoting the register during
504   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
505   getActionDefinitionsBuilder({G_LOAD, G_STORE})
506     .narrowScalarIf([](const LegalityQuery &Query) {
507         unsigned Size = Query.Types[0].getSizeInBits();
508         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
509         return (Size > 32 && MemSize < Size);
510       },
511       [](const LegalityQuery &Query) {
512         return std::make_pair(0, LLT::scalar(32));
513       })
514     .fewerElementsIf([=](const LegalityQuery &Query) {
515         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
516         return (MemSize == 96) &&
517                Query.Types[0].isVector() &&
518                !ST.hasDwordx3LoadStores();
519       },
520       [=](const LegalityQuery &Query) {
521         return std::make_pair(0, V2S32);
522       })
523     .legalIf([=](const LegalityQuery &Query) {
524         const LLT &Ty0 = Query.Types[0];
525 
526         unsigned Size = Ty0.getSizeInBits();
527         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
528         if (Size < 32 || (Size > 32 && MemSize < Size))
529           return false;
530 
531         if (Ty0.isVector() && Size != MemSize)
532           return false;
533 
534         // TODO: Decompose private loads into 4-byte components.
535         // TODO: Illegal flat loads on SI
536         switch (MemSize) {
537         case 8:
538         case 16:
539           return Size == 32;
540         case 32:
541         case 64:
542         case 128:
543           return true;
544 
545         case 96:
546           return ST.hasDwordx3LoadStores();
547 
548         case 256:
549         case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
553           // TODO: Add support for other vector types, this will require
554           //       defining more value mappings for the new types.
555           return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
556                                     Ty0.getScalarType().getSizeInBits() == 64);
557 
558         default:
559           return false;
560         }
561       })
562     .clampScalar(0, S32, S64);
563 
564 
565   // FIXME: Handle alignment requirements.
566   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
567     .legalForTypesWithMemDesc({
568         {S32, GlobalPtr, 8, 8},
569         {S32, GlobalPtr, 16, 8},
570         {S32, LocalPtr, 8, 8},
571         {S32, LocalPtr, 16, 8},
572         {S32, PrivatePtr, 8, 8},
573         {S32, PrivatePtr, 16, 8}});
574   if (ST.hasFlatAddressSpace()) {
575     ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
576                                        {S32, FlatPtr, 16, 8}});
577   }
578 
579   ExtLoads.clampScalar(0, S32, S32)
580           .widenScalarToNextPow2(0)
581           .unsupportedIfMemSizeNotPow2()
582           .lower();
583 
584   auto &Atomics = getActionDefinitionsBuilder(
585     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
586      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
587      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
588      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
589     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
590                {S64, GlobalPtr}, {S64, LocalPtr}});
591   if (ST.hasFlatAddressSpace()) {
592     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
593   }
594 
595   // TODO: Pointer types, any 32-bit or 64-bit vector
596   getActionDefinitionsBuilder(G_SELECT)
597     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
598           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
599           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
600     .clampScalar(0, S16, S64)
601     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
602     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
603     .scalarize(1)
604     .clampMaxNumElements(0, S32, 2)
605     .clampMaxNumElements(0, LocalPtr, 2)
606     .clampMaxNumElements(0, PrivatePtr, 2)
607     .scalarize(0)
608     .widenScalarToNextPow2(0)
609     .legalIf(all(isPointer(0), typeIs(1, S1)));
610 
611   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
612   // be more flexible with the shift amount type.
613   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
614     .legalFor({{S32, S32}, {S64, S32}});
615   if (ST.has16BitInsts()) {
616     if (ST.hasVOP3PInsts()) {
617       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
618             .clampMaxNumElements(0, S16, 2);
619     } else
620       Shifts.legalFor({{S16, S32}, {S16, S16}});
621 
622     Shifts.clampScalar(1, S16, S32);
623     Shifts.clampScalar(0, S16, S64);
624     Shifts.widenScalarToNextPow2(0, 16);
625   } else {
626     // Make sure we legalize the shift amount type first, as the general
627     // expansion for the shifted type will produce much worse code if it hasn't
628     // been truncated already.
629     Shifts.clampScalar(1, S32, S32);
630     Shifts.clampScalar(0, S32, S64);
631     Shifts.widenScalarToNextPow2(0, 32);
632   }
633   Shifts.scalarize(0);
634 
635   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
636     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
637     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
638     unsigned IdxTypeIdx = 2;
639 
640     getActionDefinitionsBuilder(Op)
641       .customIf([=](const LegalityQuery &Query) {
642           const LLT EltTy = Query.Types[EltTypeIdx];
643           const LLT VecTy = Query.Types[VecTypeIdx];
644           const LLT IdxTy = Query.Types[IdxTypeIdx];
645           return (EltTy.getSizeInBits() == 16 ||
646                   EltTy.getSizeInBits() % 32 == 0) &&
647                  VecTy.getSizeInBits() % 32 == 0 &&
648                  VecTy.getSizeInBits() <= 512 &&
649                  IdxTy.getSizeInBits() == 32;
650         })
651       .clampScalar(EltTypeIdx, S32, S64)
652       .clampScalar(VecTypeIdx, S32, S64)
653       .clampScalar(IdxTypeIdx, S32, S32);
654   }
655 
656   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
657     .unsupportedIf([=](const LegalityQuery &Query) {
658         const LLT &EltTy = Query.Types[1].getElementType();
659         return Query.Types[0] != EltTy;
660       });
661 
662   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
663     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
664     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
665 
666     // FIXME: Doesn't handle extract of illegal sizes.
667     getActionDefinitionsBuilder(Op)
668       .legalIf([=](const LegalityQuery &Query) {
669           const LLT BigTy = Query.Types[BigTyIdx];
670           const LLT LitTy = Query.Types[LitTyIdx];
671           return (BigTy.getSizeInBits() % 32 == 0) &&
672                  (LitTy.getSizeInBits() % 16 == 0);
673         })
674       .widenScalarIf(
675         [=](const LegalityQuery &Query) {
676           const LLT BigTy = Query.Types[BigTyIdx];
677           return (BigTy.getScalarSizeInBits() < 16);
678         },
679         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
680       .widenScalarIf(
681         [=](const LegalityQuery &Query) {
682           const LLT LitTy = Query.Types[LitTyIdx];
683           return (LitTy.getScalarSizeInBits() < 16);
684         },
685         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
686       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
687       .widenScalarToNextPow2(BigTyIdx, 32);
688 
689   }
690 
691   getActionDefinitionsBuilder(G_BUILD_VECTOR)
692       .legalForCartesianProduct(AllS32Vectors, {S32})
693       .legalForCartesianProduct(AllS64Vectors, {S64})
694       .clampNumElements(0, V16S32, V16S32)
695       .clampNumElements(0, V2S64, V8S64)
696       .minScalarSameAs(1, 0)
697       .legalIf(isRegisterType(0))
698       .minScalarOrElt(0, S32);
699 
700   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
701     .legalIf(isRegisterType(0));
702 
703   // Merge/Unmerge
704   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
705     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
706     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
707 
708     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
709       const LLT &Ty = Query.Types[TypeIdx];
710       if (Ty.isVector()) {
711         const LLT &EltTy = Ty.getElementType();
712         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
713           return true;
714         if (!isPowerOf2_32(EltTy.getSizeInBits()))
715           return true;
716       }
717       return false;
718     };
719 
720     getActionDefinitionsBuilder(Op)
721       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
725       .clampScalar(LitTyIdx, S16, S256)
726       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
727 
728       // Break up vectors with weird elements into scalars
729       .fewerElementsIf(
730         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
731         scalarize(0))
732       .fewerElementsIf(
733         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
734         scalarize(1))
735       .clampScalar(BigTyIdx, S32, S512)
736       .widenScalarIf(
737         [=](const LegalityQuery &Query) {
738           const LLT &Ty = Query.Types[BigTyIdx];
739           return !isPowerOf2_32(Ty.getSizeInBits()) &&
740                  Ty.getSizeInBits() % 16 != 0;
741         },
742         [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128, whichever
          // is smaller.
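          // For example s65 widens to s128, while s295 widens to s320 (the
          // next multiple of 64) rather than to s512.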
745           const LLT &Ty = Query.Types[BigTyIdx];
746           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
747           if (NewSizeInBits >= 256) {
748             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
749             if (RoundedTo < NewSizeInBits)
750               NewSizeInBits = RoundedTo;
751           }
752           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
753         })
754       .legalIf([=](const LegalityQuery &Query) {
755           const LLT &BigTy = Query.Types[BigTyIdx];
756           const LLT &LitTy = Query.Types[LitTyIdx];
757 
758           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
759             return false;
760           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
761             return false;
762 
763           return BigTy.getSizeInBits() % 16 == 0 &&
764                  LitTy.getSizeInBits() % 16 == 0 &&
765                  BigTy.getSizeInBits() <= 512;
766         })
767       // Any vectors left are the wrong size. Scalarize them.
768       .scalarize(0)
769       .scalarize(1);
770   }
771 
772   computeTables();
773   verify(*ST.getInstrInfo());
774 }
775 
776 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
777                                          MachineRegisterInfo &MRI,
778                                          MachineIRBuilder &MIRBuilder,
779                                          GISelChangeObserver &Observer) const {
780   switch (MI.getOpcode()) {
781   case TargetOpcode::G_ADDRSPACE_CAST:
782     return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
783   case TargetOpcode::G_FRINT:
784     return legalizeFrint(MI, MRI, MIRBuilder);
785   case TargetOpcode::G_FCEIL:
786     return legalizeFceil(MI, MRI, MIRBuilder);
787   case TargetOpcode::G_INTRINSIC_TRUNC:
788     return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
789   case TargetOpcode::G_SITOFP:
790     return legalizeITOFP(MI, MRI, MIRBuilder, true);
791   case TargetOpcode::G_UITOFP:
792     return legalizeITOFP(MI, MRI, MIRBuilder, false);
793   case TargetOpcode::G_FMINNUM:
794   case TargetOpcode::G_FMAXNUM:
795   case TargetOpcode::G_FMINNUM_IEEE:
796   case TargetOpcode::G_FMAXNUM_IEEE:
797     return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
798   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
799     return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
800   case TargetOpcode::G_INSERT_VECTOR_ELT:
801     return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
802   default:
803     return false;
804   }
805 
806   llvm_unreachable("expected switch to return");
807 }
808 
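// Return a 32-bit register holding the high half (the aperture) of the flat
// address for the given local or private address space, either read with
// s_getreg on targets that have aperture registers or loaded from the queue
// pointer.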
809 Register AMDGPULegalizerInfo::getSegmentAperture(
810   unsigned AS,
811   MachineRegisterInfo &MRI,
812   MachineIRBuilder &MIRBuilder) const {
813   MachineFunction &MF = MIRBuilder.getMF();
814   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
815   const LLT S32 = LLT::scalar(32);
816 
817   if (ST.hasApertureRegs()) {
818     // FIXME: Use inline constants (src_{shared, private}_base) instead of
819     // getreg.
820     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
821         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
822         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
823     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
824         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
825         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
826     unsigned Encoding =
827         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
828         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
829         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
830 
831     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
832     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
833 
834     MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
835       .addDef(GetReg)
836       .addImm(Encoding);
837     MRI.setType(GetReg, S32);
838 
839     auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
840     MIRBuilder.buildInstr(TargetOpcode::G_SHL)
841       .addDef(ApertureReg)
842       .addUse(GetReg)
843       .addUse(ShiftAmt.getReg(0));
844 
845     return ApertureReg;
846   }
847 
848   Register QueuePtr = MRI.createGenericVirtualRegister(
849     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
850 
851   // FIXME: Placeholder until we can track the input registers.
852   MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
853 
854   // Offset into amd_queue_t for group_segment_aperture_base_hi /
855   // private_segment_aperture_base_hi.
856   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
857 
858   // FIXME: Don't use undef
859   Value *V = UndefValue::get(PointerType::get(
860                                Type::getInt8Ty(MF.getFunction().getContext()),
861                                AMDGPUAS::CONSTANT_ADDRESS));
862 
863   MachinePointerInfo PtrInfo(V, StructOffset);
864   MachineMemOperand *MMO = MF.getMachineMemOperand(
865     PtrInfo,
866     MachineMemOperand::MOLoad |
867     MachineMemOperand::MODereferenceable |
868     MachineMemOperand::MOInvariant,
869     4,
870     MinAlign(64, StructOffset));
871 
872   Register LoadResult = MRI.createGenericVirtualRegister(S32);
873   Register LoadAddr;
874 
875   MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
876   MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
877   return LoadResult;
878 }
879 
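// Lower G_ADDRSPACE_CAST involving the flat address space. Casting flat to a
// segment address space keeps the low 32 bits, casting a segment address
// space to flat prepends the segment aperture as the high half, and the null
// pointer is mapped to the null value of the destination address space.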
880 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
881   MachineInstr &MI, MachineRegisterInfo &MRI,
882   MachineIRBuilder &MIRBuilder) const {
883   MachineFunction &MF = MIRBuilder.getMF();
884 
885   MIRBuilder.setInstr(MI);
886 
887   Register Dst = MI.getOperand(0).getReg();
888   Register Src = MI.getOperand(1).getReg();
889 
890   LLT DstTy = MRI.getType(Dst);
891   LLT SrcTy = MRI.getType(Src);
892   unsigned DestAS = DstTy.getAddressSpace();
893   unsigned SrcAS = SrcTy.getAddressSpace();
894 
895   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
896   // vector element.
897   assert(!DstTy.isVector());
898 
899   const AMDGPUTargetMachine &TM
900     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
901 
902   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
903   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
904     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
905     return true;
906   }
907 
908   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
909     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
910            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
911     unsigned NullVal = TM.getNullPointerValue(DestAS);
912 
913     auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
914     auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
915 
916     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
917 
918     // Extract low 32-bits of the pointer.
919     MIRBuilder.buildExtract(PtrLo32, Src, 0);
920 
921     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
922     MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
923     MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
924 
925     MI.eraseFromParent();
926     return true;
927   }
928 
929   assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
930          SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
931 
932   auto SegmentNull =
933       MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
934   auto FlatNull =
935       MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
936 
937   Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
938 
939   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
940   MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
941 
942   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
943 
944   // Coerce the type of the low half of the result so we can use merge_values.
945   Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
946   MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
947     .addDef(SrcAsInt)
948     .addUse(Src);
949 
950   // TODO: Should we allow mismatched types but matching sizes in merges to
951   // avoid the ptrtoint?
952   MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
953   MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
954 
955   MI.eraseFromParent();
956   return true;
957 }
958 
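// Lower G_FRINT for f64 using the 2^52 trick: adding and then subtracting a
// sign-adjusted 2^52 rounds the value to an integer in the current rounding
// mode, and inputs too large to have a fractional part are returned
// unchanged.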
959 bool AMDGPULegalizerInfo::legalizeFrint(
960   MachineInstr &MI, MachineRegisterInfo &MRI,
961   MachineIRBuilder &MIRBuilder) const {
962   MIRBuilder.setInstr(MI);
963 
964   Register Src = MI.getOperand(1).getReg();
965   LLT Ty = MRI.getType(Src);
966   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
967 
968   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
969   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
970 
971   auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
972   auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
973 
974   // TODO: Should this propagate fast-math-flags?
975   auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
976   auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
977 
978   auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
979   auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
980 
981   auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
982   MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
983   return true;
984 }
985 
986 bool AMDGPULegalizerInfo::legalizeFceil(
987   MachineInstr &MI, MachineRegisterInfo &MRI,
988   MachineIRBuilder &B) const {
989   B.setInstr(MI);
990 
991   const LLT S1 = LLT::scalar(1);
992   const LLT S64 = LLT::scalar(64);
993 
994   Register Src = MI.getOperand(1).getReg();
995   assert(MRI.getType(Src) == S64);
996 
997   // result = trunc(src)
998   // if (src > 0.0 && src != result)
999   //   result += 1.0
1000 
1001   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1002 
1003   const auto Zero = B.buildFConstant(S64, 0.0);
1004   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1008   auto Add = B.buildSelect(S64, And, One, Zero);
1009 
1010   // TODO: Should this propagate fast-math-flags?
1011   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1012   return true;
1013 }
1014 
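// Extract the unbiased exponent from the high 32 bits of an f64 value using
// the amdgcn.ubfe (unsigned bitfield extract) intrinsic.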
1015 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1016                                               MachineIRBuilder &B) {
1017   const unsigned FractBits = 52;
1018   const unsigned ExpBits = 11;
1019   LLT S32 = LLT::scalar(32);
1020 
1021   auto Const0 = B.buildConstant(S32, FractBits - 32);
1022   auto Const1 = B.buildConstant(S32, ExpBits);
1023 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1027 
1028   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1029 }
1030 
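// Lower G_INTRINSIC_TRUNC for f64 by masking off the fraction bits below the
// exponent. A negative exponent truncates to a signed zero, and an exponent
// greater than 51 means the value already has no fractional part.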
1031 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1032   MachineInstr &MI, MachineRegisterInfo &MRI,
1033   MachineIRBuilder &B) const {
1034   B.setInstr(MI);
1035 
1036   const LLT S1 = LLT::scalar(1);
1037   const LLT S32 = LLT::scalar(32);
1038   const LLT S64 = LLT::scalar(64);
1039 
1040   Register Src = MI.getOperand(1).getReg();
1041   assert(MRI.getType(Src) == S64);
1042 
1043   // TODO: Should this use extract since the low half is unused?
1044   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1045   Register Hi = Unmerge.getReg(1);
1046 
1047   // Extract the upper half, since this is where we will find the sign and
1048   // exponent.
1049   auto Exp = extractF64Exponent(Hi, B);
1050 
1051   const unsigned FractBits = 52;
1052 
1053   // Extract the sign bit.
1054   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1055   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1056 
1057   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1058 
1059   const auto Zero32 = B.buildConstant(S32, 0);
1060 
1061   // Extend back to 64-bits.
1062   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1063 
1064   auto Shr = B.buildAShr(S64, FractMask, Exp);
1065   auto Not = B.buildNot(S64, Shr);
1066   auto Tmp0 = B.buildAnd(S64, Src, Not);
1067   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1068 
1069   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1070   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1071 
1072   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1073   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1074   return true;
1075 }
1076 
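// Lower 64-bit integer to f64 conversion by converting each 32-bit half
// separately: the converted high half is scaled by 2^32 with ldexp and then
// added to the unsigned conversion of the low half.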
1077 bool AMDGPULegalizerInfo::legalizeITOFP(
1078   MachineInstr &MI, MachineRegisterInfo &MRI,
1079   MachineIRBuilder &B, bool Signed) const {
1080   B.setInstr(MI);
1081 
1082   Register Dst = MI.getOperand(0).getReg();
1083   Register Src = MI.getOperand(1).getReg();
1084 
1085   const LLT S64 = LLT::scalar(64);
1086   const LLT S32 = LLT::scalar(32);
1087 
1088   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1089 
1090   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1091 
1092   auto CvtHi = Signed ?
1093     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1094     B.buildUITOFP(S64, Unmerge.getReg(1));
1095 
1096   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1097 
1098   auto ThirtyTwo = B.buildConstant(S32, 32);
1099   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1100     .addUse(CvtHi.getReg(0))
1101     .addUse(ThirtyTwo.getReg(0));
1102 
1103   // TODO: Should this propagate fast-math-flags?
1104   B.buildFAdd(Dst, LdExp, CvtLo);
1105   MI.eraseFromParent();
1106   return true;
1107 }
1108 
1109 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1110   MachineInstr &MI, MachineRegisterInfo &MRI,
1111   MachineIRBuilder &B) const {
1112   MachineFunction &MF = B.getMF();
1113   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1114 
1115   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1116                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1117 
1118   // With ieee_mode disabled, the instructions have the correct behavior
1119   // already for G_FMINNUM/G_FMAXNUM
1120   if (!MFI->getMode().IEEE)
1121     return !IsIEEEOp;
1122 
1123   if (IsIEEEOp)
1124     return true;
1125 
1126   MachineIRBuilder HelperBuilder(MI);
1127   GISelObserverWrapper DummyObserver;
1128   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1129   HelperBuilder.setMBB(*MI.getParent());
1130   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1131 }
1132 
1133 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1134   MachineInstr &MI, MachineRegisterInfo &MRI,
1135   MachineIRBuilder &B) const {
1136   // TODO: Should move some of this into LegalizerHelper.
1137 
1138   // TODO: Promote dynamic indexing of s16 to s32
1139   // TODO: Dynamic s64 indexing is only legal for SGPR.
1140   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1141   if (!IdxVal) // Dynamic case will be selected to register indexing.
1142     return true;
1143 
1144   Register Dst = MI.getOperand(0).getReg();
1145   Register Vec = MI.getOperand(1).getReg();
1146 
1147   LLT VecTy = MRI.getType(Vec);
1148   LLT EltTy = VecTy.getElementType();
1149   assert(EltTy == MRI.getType(Dst));
1150 
1151   B.setInstr(MI);
1152 
1153   if (IdxVal.getValue() < VecTy.getNumElements())
1154     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1155   else
1156     B.buildUndef(Dst);
1157 
1158   MI.eraseFromParent();
1159   return true;
1160 }
1161 
1162 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1163   MachineInstr &MI, MachineRegisterInfo &MRI,
1164   MachineIRBuilder &B) const {
1165   // TODO: Should move some of this into LegalizerHelper.
1166 
1167   // TODO: Promote dynamic indexing of s16 to s32
1168   // TODO: Dynamic s64 indexing is only legal for SGPR.
1169   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1170   if (!IdxVal) // Dynamic case will be selected to register indexing.
1171     return true;
1172 
1173   Register Dst = MI.getOperand(0).getReg();
1174   Register Vec = MI.getOperand(1).getReg();
1175   Register Ins = MI.getOperand(2).getReg();
1176 
1177   LLT VecTy = MRI.getType(Vec);
1178   LLT EltTy = VecTy.getElementType();
1179   assert(EltTy == MRI.getType(Ins));
1180 
1181   B.setInstr(MI);
1182 
1183   if (IdxVal.getValue() < VecTy.getNumElements())
1184     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1185   else
1186     B.buildUndef(Dst);
1187 
1188   MI.eraseFromParent();
1189   return true;
1190 }
1191 
// Return the use branch instruction, or null if the usage is invalid.
1193 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1194                                        MachineRegisterInfo &MRI) {
1195   Register CondDef = MI.getOperand(0).getReg();
1196   if (!MRI.hasOneNonDBGUse(CondDef))
1197     return nullptr;
1198 
1199   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1200   return UseMI.getParent() == MI.getParent() &&
1201     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1202 }
1203 
1204 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1205                                                 Register Reg, LLT Ty) const {
1206   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1207   if (LiveIn)
1208     return LiveIn;
1209 
1210   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1211   MRI.addLiveIn(Reg, NewReg);
1212   return NewReg;
1213 }
1214 
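// Copy a preloaded argument register into DstReg, unpacking masked arguments
// with a shift and mask, and inserting the live-in copy in the entry block if
// one does not already exist.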
1215 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1216                                          const ArgDescriptor *Arg) const {
1217   if (!Arg->isRegister())
1218     return false; // TODO: Handle these
1219 
1220   assert(Arg->getRegister() != 0);
1221   assert(Arg->getRegister().isPhysical());
1222 
1223   MachineRegisterInfo &MRI = *B.getMRI();
1224 
1225   LLT Ty = MRI.getType(DstReg);
1226   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1227 
1228   if (Arg->isMasked()) {
1229     // TODO: Should we try to emit this once in the entry block?
1230     const LLT S32 = LLT::scalar(32);
1231     const unsigned Mask = Arg->getMask();
1232     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1233 
1234     auto ShiftAmt = B.buildConstant(S32, Shift);
1235     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1236     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1237   } else
1238     B.buildCopy(DstReg, LiveIn);
1239 
  // Insert the argument copy if it doesn't already exist.
1241   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1242   if (!MRI.getVRegDef(LiveIn)) {
1243     MachineBasicBlock &EntryMBB = B.getMF().front();
1244     EntryMBB.addLiveIn(Arg->getRegister());
1245     B.setInsertPt(EntryMBB, EntryMBB.begin());
1246     B.buildCopy(LiveIn, Arg->getRegister());
1247   }
1248 
1249   return true;
1250 }
1251 
1252 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1253   MachineInstr &MI,
1254   MachineRegisterInfo &MRI,
1255   MachineIRBuilder &B,
1256   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1257   B.setInstr(MI);
1258 
1259   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1260 
1261   const ArgDescriptor *Arg;
1262   const TargetRegisterClass *RC;
1263   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1264   if (!Arg) {
1265     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1266     return false;
1267   }
1268 
1269   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1270     MI.eraseFromParent();
1271     return true;
1272   }
1273 
1274   return false;
1275 }
1276 
1277 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1278                                                  MachineRegisterInfo &MRI,
1279                                                  MachineIRBuilder &B) const {
1280   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1281   if (!MFI->isEntryFunction()) {
1282     return legalizePreloadedArgIntrin(MI, MRI, B,
1283                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1284   }
1285 
1286   B.setInstr(MI);
1287 
1288   uint64_t Offset =
1289     ST.getTargetLowering()->getImplicitParameterOffset(
1290       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1291   Register DstReg = MI.getOperand(0).getReg();
1292   LLT DstTy = MRI.getType(DstReg);
1293   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1294 
1295   const ArgDescriptor *Arg;
1296   const TargetRegisterClass *RC;
1297   std::tie(Arg, RC)
1298     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1299   if (!Arg)
1300     return false;
1301 
1302   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1303   if (!loadInputValue(KernargPtrReg, B, Arg))
1304     return false;
1305 
1306   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1307   MI.eraseFromParent();
1308   return true;
1309 }
1310 
1311 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1312                                             MachineRegisterInfo &MRI,
1313                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos
  // (SI_IF / SI_LOOP).
1315   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1316   case Intrinsic::amdgcn_if: {
1317     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1318       const SIRegisterInfo *TRI
1319         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1320 
1321       B.setInstr(*BrCond);
1322       Register Def = MI.getOperand(1).getReg();
1323       Register Use = MI.getOperand(3).getReg();
1324       B.buildInstr(AMDGPU::SI_IF)
1325         .addDef(Def)
1326         .addUse(Use)
1327         .addMBB(BrCond->getOperand(1).getMBB());
1328 
1329       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1330       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1331       MI.eraseFromParent();
1332       BrCond->eraseFromParent();
1333       return true;
1334     }
1335 
1336     return false;
1337   }
1338   case Intrinsic::amdgcn_loop: {
1339     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1340       const SIRegisterInfo *TRI
1341         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1342 
1343       B.setInstr(*BrCond);
1344       Register Reg = MI.getOperand(2).getReg();
1345       B.buildInstr(AMDGPU::SI_LOOP)
1346         .addUse(Reg)
1347         .addMBB(BrCond->getOperand(1).getMBB());
1348       MI.eraseFromParent();
1349       BrCond->eraseFromParent();
1350       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1351       return true;
1352     }
1353 
1354     return false;
1355   }
1356   case Intrinsic::amdgcn_kernarg_segment_ptr:
1357     return legalizePreloadedArgIntrin(
1358       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1359   case Intrinsic::amdgcn_implicitarg_ptr:
1360     return legalizeImplicitArgPtr(MI, MRI, B);
1361   case Intrinsic::amdgcn_workitem_id_x:
1362     return legalizePreloadedArgIntrin(MI, MRI, B,
1363                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1364   case Intrinsic::amdgcn_workitem_id_y:
1365     return legalizePreloadedArgIntrin(MI, MRI, B,
1366                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1367   case Intrinsic::amdgcn_workitem_id_z:
1368     return legalizePreloadedArgIntrin(MI, MRI, B,
1369                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1370   case Intrinsic::amdgcn_workgroup_id_x:
1371     return legalizePreloadedArgIntrin(MI, MRI, B,
1372                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1373   case Intrinsic::amdgcn_workgroup_id_y:
1374     return legalizePreloadedArgIntrin(MI, MRI, B,
1375                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1376   case Intrinsic::amdgcn_workgroup_id_z:
1377     return legalizePreloadedArgIntrin(MI, MRI, B,
1378                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1379   case Intrinsic::amdgcn_dispatch_ptr:
1380     return legalizePreloadedArgIntrin(MI, MRI, B,
1381                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1382   case Intrinsic::amdgcn_queue_ptr:
1383     return legalizePreloadedArgIntrin(MI, MRI, B,
1384                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1385   case Intrinsic::amdgcn_implicit_buffer_ptr:
1386     return legalizePreloadedArgIntrin(
1387       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1388   case Intrinsic::amdgcn_dispatch_id:
1389     return legalizePreloadedArgIntrin(MI, MRI, B,
1390                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1391   default:
1392     return true;
1393   }
1394 
1395   return true;
1396 }
1397