1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
25 
26 #define DEBUG_TYPE "amdgpu-legalinfo"
27 
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
32 
33 
// Match any type up to MaxSize bits whose scalar (element) size is a multiple
// of 32 bits.
static LegalityPredicate isMultiple32(unsigned TypeIdx,
35                                       unsigned MaxSize = 512) {
36   return [=](const LegalityQuery &Query) {
37     const LLT Ty = Query.Types[TypeIdx];
38     const LLT EltTy = Ty.getScalarType();
39     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
40   };
41 }
42 
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     return Ty.isVector() &&
47            Ty.getNumElements() % 2 != 0 &&
48            Ty.getElementType().getSizeInBits() < 32;
49   };
50 }
51 
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53   return [=](const LegalityQuery &Query) {
54     const LLT Ty = Query.Types[TypeIdx];
55     const LLT EltTy = Ty.getElementType();
56     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
57   };
58 }
59 
// Split a wide vector into pieces of roughly 64 bits each, keeping the
// element type.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     unsigned Size = Ty.getSizeInBits();
65     unsigned Pieces = (Size + 63) / 64;
66     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
68   };
69 }
70 
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72   return [=](const LegalityQuery &Query) {
73     const LLT QueryTy = Query.Types[TypeIdx];
74     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
75   };
76 }
77 
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT QueryTy = Query.Types[TypeIdx];
81     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
82   };
83 }
84 
// Any combination of 32, 64, 128 or 256-bit elements, even-length vectors of
// 16-bit elements (multiples of v2s16), and scalars that are a multiple of 32
// bits up to 512 bits.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     if (Ty.isVector()) {
91       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
95     }
96 
97     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
98   };
99 }
100 
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102                                          const GCNTargetMachine &TM)
103   :  ST(ST_) {
104   using namespace TargetOpcode;
105 
106   auto GetAddrSpacePtr = [&TM](unsigned AS) {
107     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
108   };
109 
110   const LLT S1 = LLT::scalar(1);
111   const LLT S8 = LLT::scalar(8);
112   const LLT S16 = LLT::scalar(16);
113   const LLT S32 = LLT::scalar(32);
114   const LLT S64 = LLT::scalar(64);
115   const LLT S128 = LLT::scalar(128);
116   const LLT S256 = LLT::scalar(256);
117   const LLT S512 = LLT::scalar(512);
118 
119   const LLT V2S16 = LLT::vector(2, 16);
120   const LLT V4S16 = LLT::vector(4, 16);
121 
122   const LLT V2S32 = LLT::vector(2, 32);
123   const LLT V3S32 = LLT::vector(3, 32);
124   const LLT V4S32 = LLT::vector(4, 32);
125   const LLT V5S32 = LLT::vector(5, 32);
126   const LLT V6S32 = LLT::vector(6, 32);
127   const LLT V7S32 = LLT::vector(7, 32);
128   const LLT V8S32 = LLT::vector(8, 32);
129   const LLT V9S32 = LLT::vector(9, 32);
130   const LLT V10S32 = LLT::vector(10, 32);
131   const LLT V11S32 = LLT::vector(11, 32);
132   const LLT V12S32 = LLT::vector(12, 32);
133   const LLT V13S32 = LLT::vector(13, 32);
134   const LLT V14S32 = LLT::vector(14, 32);
135   const LLT V15S32 = LLT::vector(15, 32);
136   const LLT V16S32 = LLT::vector(16, 32);
137 
138   const LLT V2S64 = LLT::vector(2, 64);
139   const LLT V3S64 = LLT::vector(3, 64);
140   const LLT V4S64 = LLT::vector(4, 64);
141   const LLT V5S64 = LLT::vector(5, 64);
142   const LLT V6S64 = LLT::vector(6, 64);
143   const LLT V7S64 = LLT::vector(7, 64);
144   const LLT V8S64 = LLT::vector(8, 64);
145 
146   std::initializer_list<LLT> AllS32Vectors =
147     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149   std::initializer_list<LLT> AllS64Vectors =
150     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
151 
152   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
155   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
156   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
157 
158   const LLT CodePtr = FlatPtr;
159 
160   const std::initializer_list<LLT> AddrSpaces64 = {
161     GlobalPtr, ConstantPtr, FlatPtr
162   };
163 
164   const std::initializer_list<LLT> AddrSpaces32 = {
165     LocalPtr, PrivatePtr
166   };
167 
168   const std::initializer_list<LLT> FPTypesBase = {
169     S32, S64
170   };
171 
172   const std::initializer_list<LLT> FPTypes16 = {
173     S32, S64, S16
174   };
175 
176   const std::initializer_list<LLT> FPTypesPK16 = {
177     S32, S64, S16, V2S16
178   };
179 
180   setAction({G_BRCOND, S1}, Legal);
181 
182   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
183   // elements for v3s16
184   getActionDefinitionsBuilder(G_PHI)
185     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
186     .legalFor(AllS32Vectors)
187     .legalFor(AllS64Vectors)
188     .legalFor(AddrSpaces64)
189     .legalFor(AddrSpaces32)
190     .clampScalar(0, S32, S256)
191     .widenScalarToNextPow2(0, 32)
192     .clampMaxNumElements(0, S32, 16)
193     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
194     .legalIf(isPointer(0));
195 
196   if (ST.has16BitInsts()) {
197     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
198       .legalFor({S32, S16})
199       .clampScalar(0, S16, S32)
200       .scalarize(0);
201   } else {
202     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
203       .legalFor({S32})
204       .clampScalar(0, S32, S32)
205       .scalarize(0);
206   }
207 
208   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
209     .legalFor({S32})
210     .clampScalar(0, S32, S32)
211     .scalarize(0);
212 
  // Report legal for any types we can handle anywhere. For the cases that are
  // only legal on the SALU, RegBankSelect will be able to re-legalize.
215   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
216     .legalFor({S32, S1, S64, V2S32, V2S16, V4S16})
217     .clampScalar(0, S32, S64)
218     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
219     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
220     .widenScalarToNextPow2(0)
221     .scalarize(0);
222 
223   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
224                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
225     .legalFor({{S32, S1}})
226     .clampScalar(0, S32, S32);
227 
228   getActionDefinitionsBuilder(G_BITCAST)
229     .legalForCartesianProduct({S32, V2S16})
230     .legalForCartesianProduct({S64, V2S32, V4S16})
231     .legalForCartesianProduct({V2S64, V4S32})
232     // Don't worry about the size constraint.
233     .legalIf(all(isPointer(0), isPointer(1)));
234 
235   if (ST.has16BitInsts()) {
236     getActionDefinitionsBuilder(G_FCONSTANT)
237       .legalFor({S32, S64, S16})
238       .clampScalar(0, S16, S64);
239   } else {
240     getActionDefinitionsBuilder(G_FCONSTANT)
241       .legalFor({S32, S64})
242       .clampScalar(0, S32, S64);
243   }
244 
245   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
246     .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
247                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
248     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
249     .clampScalarOrElt(0, S32, S512)
250     .legalIf(isMultiple32(0))
251     .widenScalarToNextPow2(0, 32)
252     .clampMaxNumElements(0, S32, 16);
253 
254 
255   // FIXME: i1 operands to intrinsics should always be legal, but other i1
256   // values may not be legal.  We need to figure out how to distinguish
257   // between these two scenarios.
258   getActionDefinitionsBuilder(G_CONSTANT)
259     .legalFor({S1, S32, S64, GlobalPtr,
260                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
261     .clampScalar(0, S32, S64)
262     .widenScalarToNextPow2(0)
263     .legalIf(isPointer(0));
264 
265   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
266 
267   auto &FPOpActions = getActionDefinitionsBuilder(
268     { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
269     .legalFor({S32, S64});
270 
271   if (ST.has16BitInsts()) {
272     if (ST.hasVOP3PInsts())
273       FPOpActions.legalFor({S16, V2S16});
274     else
275       FPOpActions.legalFor({S16});
276   }
277 
278   auto &MinNumMaxNum = getActionDefinitionsBuilder({
279       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
280 
281   if (ST.hasVOP3PInsts()) {
282     MinNumMaxNum.customFor(FPTypesPK16)
283       .clampMaxNumElements(0, S16, 2)
284       .clampScalar(0, S16, S64)
285       .scalarize(0);
286   } else if (ST.has16BitInsts()) {
287     MinNumMaxNum.customFor(FPTypes16)
288       .clampScalar(0, S16, S64)
289       .scalarize(0);
290   } else {
291     MinNumMaxNum.customFor(FPTypesBase)
292       .clampScalar(0, S32, S64)
293       .scalarize(0);
294   }
295 
296   // TODO: Implement
297   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
298 
299   if (ST.hasVOP3PInsts())
300     FPOpActions.clampMaxNumElements(0, S16, 2);
301   FPOpActions
302     .scalarize(0)
303     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
304 
305   if (ST.has16BitInsts()) {
306     getActionDefinitionsBuilder(G_FSQRT)
307       .legalFor({S32, S64, S16})
308       .scalarize(0)
309       .clampScalar(0, S16, S64);
310   } else {
311     getActionDefinitionsBuilder(G_FSQRT)
312       .legalFor({S32, S64})
313       .scalarize(0)
314       .clampScalar(0, S32, S64);
315   }
316 
317   getActionDefinitionsBuilder(G_FPTRUNC)
318     .legalFor({{S32, S64}, {S16, S32}})
319     .scalarize(0);
320 
321   getActionDefinitionsBuilder(G_FPEXT)
322     .legalFor({{S64, S32}, {S32, S16}})
323     .lowerFor({{S64, S16}}) // FIXME: Implement
324     .scalarize(0);
325 
326   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
327   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
328 
329   getActionDefinitionsBuilder(G_FSUB)
330       // Use actual fsub instruction
331       .legalFor({S32})
332       // Must use fadd + fneg
333       .lowerFor({S64, S16, V2S16})
334       .scalarize(0)
335       .clampScalar(0, S32, S64);
336 
337   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
338     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
339                {S32, S1}, {S64, S1}, {S16, S1},
340                // FIXME: Hack
341                {S64, LLT::scalar(33)},
342                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
343     .scalarize(0);
344 
345   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
346     .legalFor({{S32, S32}, {S64, S32}})
347     .lowerFor({{S32, S64}})
348     .customFor({{S64, S64}})
349     .scalarize(0);
350 
351   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
352     .legalFor({{S32, S32}, {S32, S64}})
353     .scalarize(0);
354 
355   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
356     .legalFor({S32, S64})
357     .scalarize(0);
358 
359   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
360     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
361       .legalFor({S32, S64})
362       .clampScalar(0, S32, S64)
363       .scalarize(0);
364   } else {
365     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
366       .legalFor({S32})
367       .customFor({S64})
368       .clampScalar(0, S32, S64)
369       .scalarize(0);
370   }
371 
372   getActionDefinitionsBuilder(G_GEP)
373     .legalForCartesianProduct(AddrSpaces64, {S64})
374     .legalForCartesianProduct(AddrSpaces32, {S32})
375     .scalarize(0);
376 
377   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
378 
379   auto &CmpBuilder =
380     getActionDefinitionsBuilder(G_ICMP)
381     .legalForCartesianProduct(
382       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
383     .legalFor({{S1, S32}, {S1, S64}});
384   if (ST.has16BitInsts()) {
385     CmpBuilder.legalFor({{S1, S16}});
386   }
387 
388   CmpBuilder
389     .widenScalarToNextPow2(1)
390     .clampScalar(1, S32, S64)
391     .scalarize(0)
392     .legalIf(all(typeIs(0, S1), isPointer(1)));
393 
394   getActionDefinitionsBuilder(G_FCMP)
395     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
396     .widenScalarToNextPow2(1)
397     .clampScalar(1, S32, S64)
398     .scalarize(0);
399 
400   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
401   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
402                                G_FLOG, G_FLOG2, G_FLOG10})
403     .legalFor({S32})
404     .scalarize(0);
405 
406   // The 64-bit versions produce 32-bit results, but only on the SALU.
407   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
408                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
409                                G_CTPOP})
410     .legalFor({{S32, S32}, {S32, S64}})
411     .clampScalar(0, S32, S32)
412     .clampScalar(1, S32, S64)
413     .scalarize(0)
414     .widenScalarToNextPow2(0, 32)
415     .widenScalarToNextPow2(1, 32);
416 
417   // TODO: Expand for > s32
418   getActionDefinitionsBuilder(G_BSWAP)
419     .legalFor({S32})
420     .clampScalar(0, S32, S32)
421     .scalarize(0);
422 
423   if (ST.has16BitInsts()) {
424     if (ST.hasVOP3PInsts()) {
425       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
426         .legalFor({S32, S16, V2S16})
427         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
428         .clampMaxNumElements(0, S16, 2)
429         .clampScalar(0, S16, S32)
430         .widenScalarToNextPow2(0)
431         .scalarize(0);
432     } else {
433       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
434         .legalFor({S32, S16})
435         .widenScalarToNextPow2(0)
436         .clampScalar(0, S16, S32)
437         .scalarize(0);
438     }
439   } else {
440     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
441       .legalFor({S32})
442       .clampScalar(0, S32, S32)
443       .widenScalarToNextPow2(0)
444       .scalarize(0);
445   }
446 
447   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
448     return [=](const LegalityQuery &Query) {
449       return Query.Types[TypeIdx0].getSizeInBits() <
450              Query.Types[TypeIdx1].getSizeInBits();
451     };
452   };
453 
454   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
455     return [=](const LegalityQuery &Query) {
456       return Query.Types[TypeIdx0].getSizeInBits() >
457              Query.Types[TypeIdx1].getSizeInBits();
458     };
459   };
460 
461   getActionDefinitionsBuilder(G_INTTOPTR)
462     // List the common cases
463     .legalForCartesianProduct(AddrSpaces64, {S64})
464     .legalForCartesianProduct(AddrSpaces32, {S32})
465     .scalarize(0)
466     // Accept any address space as long as the size matches
467     .legalIf(sameSize(0, 1))
468     .widenScalarIf(smallerThan(1, 0),
469       [](const LegalityQuery &Query) {
470         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
471       })
472     .narrowScalarIf(greaterThan(1, 0),
473       [](const LegalityQuery &Query) {
474         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
475       });
476 
477   getActionDefinitionsBuilder(G_PTRTOINT)
478     // List the common cases
479     .legalForCartesianProduct(AddrSpaces64, {S64})
480     .legalForCartesianProduct(AddrSpaces32, {S32})
481     .scalarize(0)
482     // Accept any address space as long as the size matches
483     .legalIf(sameSize(0, 1))
484     .widenScalarIf(smallerThan(0, 1),
485       [](const LegalityQuery &Query) {
486         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
487       })
488     .narrowScalarIf(
489       greaterThan(0, 1),
490       [](const LegalityQuery &Query) {
491         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
492       });
493 
494   if (ST.hasFlatAddressSpace()) {
495     getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
496       .scalarize(0)
497       .custom();
498   }
499 
500   getActionDefinitionsBuilder({G_LOAD, G_STORE})
501     .narrowScalarIf([](const LegalityQuery &Query) {
502         unsigned Size = Query.Types[0].getSizeInBits();
503         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
504         return (Size > 32 && MemSize < Size);
505       },
506       [](const LegalityQuery &Query) {
507         return std::make_pair(0, LLT::scalar(32));
508       })
509     .fewerElementsIf([=](const LegalityQuery &Query) {
510         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
511         return (MemSize == 96) &&
512                Query.Types[0].isVector() &&
513                !ST.hasDwordx3LoadStores();
514       },
515       [=](const LegalityQuery &Query) {
516         return std::make_pair(0, V2S32);
517       })
518     .legalIf([=](const LegalityQuery &Query) {
519         const LLT &Ty0 = Query.Types[0];
520 
521         unsigned Size = Ty0.getSizeInBits();
522         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
523         if (Size < 32 || (Size > 32 && MemSize < Size))
524           return false;
525 
526         if (Ty0.isVector() && Size != MemSize)
527           return false;
528 
529         // TODO: Decompose private loads into 4-byte components.
530         // TODO: Illegal flat loads on SI
531         switch (MemSize) {
532         case 8:
533         case 16:
534           return Size == 32;
535         case 32:
536         case 64:
537         case 128:
538           return true;
539 
540         case 96:
541           return ST.hasDwordx3LoadStores();
542 
543         case 256:
544         case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
548           // TODO: Add support for other vector types, this will require
549           //       defining more value mappings for the new types.
550           return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
551                                     Ty0.getScalarType().getSizeInBits() == 64);
552 
553         default:
554           return false;
555         }
556       })
557     .clampScalar(0, S32, S64);
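
  // For illustration: an s16 load fails the legality predicate above
  // (Size < 32), so clampScalar widens the result to s32, leaving a 32-bit
  // register with a 16-bit memory access, which the 8/16-bit cases accept. A
  // 96-bit vector load is only legal with dwordx3 load/stores and is
  // otherwise split up by the fewerElementsIf rule.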
558 
559 
560   // FIXME: Handle alignment requirements.
561   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
562     .legalForTypesWithMemDesc({
563         {S32, GlobalPtr, 8, 8},
564         {S32, GlobalPtr, 16, 8},
565         {S32, LocalPtr, 8, 8},
566         {S32, LocalPtr, 16, 8},
567         {S32, PrivatePtr, 8, 8},
568         {S32, PrivatePtr, 16, 8}});
569   if (ST.hasFlatAddressSpace()) {
570     ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
571                                        {S32, FlatPtr, 16, 8}});
572   }
573 
574   ExtLoads.clampScalar(0, S32, S32)
575           .widenScalarToNextPow2(0)
576           .unsupportedIfMemSizeNotPow2()
577           .lower();
578 
579   auto &Atomics = getActionDefinitionsBuilder(
580     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
581      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
582      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
583      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
584     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
585                {S64, GlobalPtr}, {S64, LocalPtr}});
586   if (ST.hasFlatAddressSpace()) {
587     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
588   }
589 
590   // TODO: Pointer types, any 32-bit or 64-bit vector
591   getActionDefinitionsBuilder(G_SELECT)
592     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
593           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
594           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
595     .clampScalar(0, S16, S64)
596     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
597     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
598     .scalarize(1)
599     .clampMaxNumElements(0, S32, 2)
600     .clampMaxNumElements(0, LocalPtr, 2)
601     .clampMaxNumElements(0, PrivatePtr, 2)
602     .scalarize(0)
603     .widenScalarToNextPow2(0)
604     .legalIf(all(isPointer(0), typeIs(1, S1)));
605 
606   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
607   // be more flexible with the shift amount type.
608   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
609     .legalFor({{S32, S32}, {S64, S32}});
610   if (ST.has16BitInsts()) {
611     if (ST.hasVOP3PInsts()) {
612       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
613             .clampMaxNumElements(0, S16, 2);
614     } else
615       Shifts.legalFor({{S16, S32}, {S16, S16}});
616 
617     Shifts.clampScalar(1, S16, S32);
618     Shifts.clampScalar(0, S16, S64);
619     Shifts.widenScalarToNextPow2(0, 16);
620   } else {
621     // Make sure we legalize the shift amount type first, as the general
622     // expansion for the shifted type will produce much worse code if it hasn't
623     // been truncated already.
624     Shifts.clampScalar(1, S32, S32);
625     Shifts.clampScalar(0, S32, S64);
626     Shifts.widenScalarToNextPow2(0, 32);
627   }
628   Shifts.scalarize(0);
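
  // E.g. without 16-bit instructions, an s64 shift by an s64 amount first has
  // the amount clamped to s32, at which point {S64, S32} is directly legal.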
629 
630   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
631     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
632     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
633     unsigned IdxTypeIdx = 2;
634 
635     getActionDefinitionsBuilder(Op)
636       .legalIf([=](const LegalityQuery &Query) {
637           const LLT &VecTy = Query.Types[VecTypeIdx];
638           const LLT &IdxTy = Query.Types[IdxTypeIdx];
639           return VecTy.getSizeInBits() % 32 == 0 &&
640             VecTy.getSizeInBits() <= 512 &&
641             IdxTy.getSizeInBits() == 32;
642         })
643       .clampScalar(EltTypeIdx, S32, S64)
644       .clampScalar(VecTypeIdx, S32, S64)
645       .clampScalar(IdxTypeIdx, S32, S32);
646   }
647 
648   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
649     .unsupportedIf([=](const LegalityQuery &Query) {
650         const LLT &EltTy = Query.Types[1].getElementType();
651         return Query.Types[0] != EltTy;
652       });
653 
654   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
655     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
656     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
657 
658     // FIXME: Doesn't handle extract of illegal sizes.
659     getActionDefinitionsBuilder(Op)
660       .legalIf([=](const LegalityQuery &Query) {
661           const LLT BigTy = Query.Types[BigTyIdx];
662           const LLT LitTy = Query.Types[LitTyIdx];
663           return (BigTy.getSizeInBits() % 32 == 0) &&
664                  (LitTy.getSizeInBits() % 16 == 0);
665         })
666       .widenScalarIf(
667         [=](const LegalityQuery &Query) {
668           const LLT BigTy = Query.Types[BigTyIdx];
669           return (BigTy.getScalarSizeInBits() < 16);
670         },
671         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
672       .widenScalarIf(
673         [=](const LegalityQuery &Query) {
674           const LLT LitTy = Query.Types[LitTyIdx];
675           return (LitTy.getScalarSizeInBits() < 16);
676         },
677         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
678       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
679       .widenScalarToNextPow2(BigTyIdx, 32);
680 
681   }
682 
683   getActionDefinitionsBuilder(G_BUILD_VECTOR)
684       .legalForCartesianProduct(AllS32Vectors, {S32})
685       .legalForCartesianProduct(AllS64Vectors, {S64})
686       .clampNumElements(0, V16S32, V16S32)
687       .clampNumElements(0, V2S64, V8S64)
688       .minScalarSameAs(1, 0)
689       .legalIf(isRegisterType(0))
690       .minScalarOrElt(0, S32);
691 
692   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
693     .legalIf(isRegisterType(0));
694 
695   // Merge/Unmerge
696   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
697     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
698     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
699 
700     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
701       const LLT &Ty = Query.Types[TypeIdx];
702       if (Ty.isVector()) {
703         const LLT &EltTy = Ty.getElementType();
704         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
705           return true;
706         if (!isPowerOf2_32(EltTy.getSizeInBits()))
707           return true;
708       }
709       return false;
710     };
711 
712     getActionDefinitionsBuilder(Op)
713       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
717       .clampScalar(LitTyIdx, S16, S256)
718       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
719 
720       // Break up vectors with weird elements into scalars
721       .fewerElementsIf(
722         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
723         scalarize(0))
724       .fewerElementsIf(
725         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
726         scalarize(1))
727       .clampScalar(BigTyIdx, S32, S512)
728       .widenScalarIf(
729         [=](const LegalityQuery &Query) {
730           const LLT &Ty = Query.Types[BigTyIdx];
731           return !isPowerOf2_32(Ty.getSizeInBits()) &&
732                  Ty.getSizeInBits() % 16 != 0;
733         },
734         [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
737           const LLT &Ty = Query.Types[BigTyIdx];
738           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
739           if (NewSizeInBits >= 256) {
740             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
741             if (RoundedTo < NewSizeInBits)
742               NewSizeInBits = RoundedTo;
743           }
744           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
745         })
746       .legalIf([=](const LegalityQuery &Query) {
747           const LLT &BigTy = Query.Types[BigTyIdx];
748           const LLT &LitTy = Query.Types[LitTyIdx];
749 
750           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
751             return false;
752           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
753             return false;
754 
755           return BigTy.getSizeInBits() % 16 == 0 &&
756                  LitTy.getSizeInBits() % 16 == 0 &&
757                  BigTy.getSizeInBits() <= 512;
758         })
759       // Any vectors left are the wrong size. Scalarize them.
760       .scalarize(0)
761       .scalarize(1);
762   }
763 
764   computeTables();
765   verify(*ST.getInstrInfo());
766 }
767 
768 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
769                                          MachineRegisterInfo &MRI,
770                                          MachineIRBuilder &MIRBuilder,
771                                          GISelChangeObserver &Observer) const {
772   switch (MI.getOpcode()) {
773   case TargetOpcode::G_ADDRSPACE_CAST:
774     return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
775   case TargetOpcode::G_FRINT:
776     return legalizeFrint(MI, MRI, MIRBuilder);
777   case TargetOpcode::G_FCEIL:
778     return legalizeFceil(MI, MRI, MIRBuilder);
779   case TargetOpcode::G_INTRINSIC_TRUNC:
780     return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
781   case TargetOpcode::G_SITOFP:
782     return legalizeITOFP(MI, MRI, MIRBuilder, true);
783   case TargetOpcode::G_UITOFP:
784     return legalizeITOFP(MI, MRI, MIRBuilder, false);
785   case TargetOpcode::G_FMINNUM:
786   case TargetOpcode::G_FMAXNUM:
787   case TargetOpcode::G_FMINNUM_IEEE:
788   case TargetOpcode::G_FMAXNUM_IEEE:
789     return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
790   default:
791     return false;
792   }
793 
794   llvm_unreachable("expected switch to return");
795 }
796 
797 Register AMDGPULegalizerInfo::getSegmentAperture(
798   unsigned AS,
799   MachineRegisterInfo &MRI,
800   MachineIRBuilder &MIRBuilder) const {
801   MachineFunction &MF = MIRBuilder.getMF();
802   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
803   const LLT S32 = LLT::scalar(32);
804 
805   if (ST.hasApertureRegs()) {
806     // FIXME: Use inline constants (src_{shared, private}_base) instead of
807     // getreg.
808     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
809         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
810         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
811     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
812         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
813         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
814     unsigned Encoding =
815         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
816         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
817         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
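
    // This is equivalent to requesting hwreg(HW_REG_MEM_BASES, Offset,
    // WidthM1 + 1). The returned field holds the high bits of the aperture
    // base, so it is shifted back into position below.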
818 
819     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
820     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
821 
822     MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
823       .addDef(GetReg)
824       .addImm(Encoding);
825     MRI.setType(GetReg, S32);
826 
827     auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
828     MIRBuilder.buildInstr(TargetOpcode::G_SHL)
829       .addDef(ApertureReg)
830       .addUse(GetReg)
831       .addUse(ShiftAmt.getReg(0));
832 
833     return ApertureReg;
834   }
835 
836   Register QueuePtr = MRI.createGenericVirtualRegister(
837     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
838 
839   // FIXME: Placeholder until we can track the input registers.
840   MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
841 
842   // Offset into amd_queue_t for group_segment_aperture_base_hi /
843   // private_segment_aperture_base_hi.
844   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
845 
846   // FIXME: Don't use undef
847   Value *V = UndefValue::get(PointerType::get(
848                                Type::getInt8Ty(MF.getFunction().getContext()),
849                                AMDGPUAS::CONSTANT_ADDRESS));
850 
851   MachinePointerInfo PtrInfo(V, StructOffset);
852   MachineMemOperand *MMO = MF.getMachineMemOperand(
853     PtrInfo,
854     MachineMemOperand::MOLoad |
855     MachineMemOperand::MODereferenceable |
856     MachineMemOperand::MOInvariant,
857     4,
858     MinAlign(64, StructOffset));
859 
860   Register LoadResult = MRI.createGenericVirtualRegister(S32);
861   Register LoadAddr;
862 
863   MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
864   MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
865   return LoadResult;
866 }
867 
868 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
869   MachineInstr &MI, MachineRegisterInfo &MRI,
870   MachineIRBuilder &MIRBuilder) const {
871   MachineFunction &MF = MIRBuilder.getMF();
872 
873   MIRBuilder.setInstr(MI);
874 
875   Register Dst = MI.getOperand(0).getReg();
876   Register Src = MI.getOperand(1).getReg();
877 
878   LLT DstTy = MRI.getType(Dst);
879   LLT SrcTy = MRI.getType(Src);
880   unsigned DestAS = DstTy.getAddressSpace();
881   unsigned SrcAS = SrcTy.getAddressSpace();
882 
883   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
884   // vector element.
885   assert(!DstTy.isVector());
886 
887   const AMDGPUTargetMachine &TM
888     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
889 
890   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
891   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
892     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
893     return true;
894   }
895 
896   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
897     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
898            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
899     unsigned NullVal = TM.getNullPointerValue(DestAS);
900 
901     auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
902     auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
903 
904     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
905 
906     // Extract low 32-bits of the pointer.
907     MIRBuilder.buildExtract(PtrLo32, Src, 0);
908 
909     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
910     MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
911     MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
912 
913     MI.eraseFromParent();
914     return true;
915   }
916 
917   assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
918          SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
919 
920   auto SegmentNull =
921       MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
922   auto FlatNull =
923       MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
924 
925   Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
926 
927   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
928   MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
929 
930   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
931 
932   // Coerce the type of the low half of the result so we can use merge_values.
933   Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
934   MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
935     .addDef(SrcAsInt)
936     .addUse(Src);
937 
938   // TODO: Should we allow mismatched types but matching sizes in merges to
939   // avoid the ptrtoint?
940   MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
941   MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
942 
943   MI.eraseFromParent();
944   return true;
945 }
946 
947 bool AMDGPULegalizerInfo::legalizeFrint(
948   MachineInstr &MI, MachineRegisterInfo &MRI,
949   MachineIRBuilder &MIRBuilder) const {
950   MIRBuilder.setInstr(MI);
951 
952   Register Src = MI.getOperand(1).getReg();
953   LLT Ty = MRI.getType(Src);
954   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
955 
956   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
957   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
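  // Adding 2^52 (C1) and subtracting it again rounds Src to an integer using
  // the FP unit's rounding mode; the copysign keeps the trick working for
  // negative inputs. C2 (just below 2^52) catches magnitudes too large to
  // have a fractional part, which are passed through unchanged.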
958 
959   auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
960   auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
961 
962   // TODO: Should this propagate fast-math-flags?
963   auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
964   auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
965 
966   auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
967   auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
968 
969   auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
972 }
973 
974 bool AMDGPULegalizerInfo::legalizeFceil(
975   MachineInstr &MI, MachineRegisterInfo &MRI,
976   MachineIRBuilder &B) const {
977   B.setInstr(MI);
978 
979   const LLT S1 = LLT::scalar(1);
980   const LLT S64 = LLT::scalar(64);
981 
982   Register Src = MI.getOperand(1).getReg();
983   assert(MRI.getType(Src) == S64);
984 
985   // result = trunc(src)
986   // if (src > 0.0 && src != result)
987   //   result += 1.0
988 
989   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
990 
991   const auto Zero = B.buildFConstant(S64, 0.0);
992   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
996   auto Add = B.buildSelect(S64, And, One, Zero);
997 
998   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1001 }
1002 
// Extract the unbiased exponent from the high 32 bits of an f64 value.
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1004                                               MachineIRBuilder &B) {
1005   const unsigned FractBits = 52;
1006   const unsigned ExpBits = 11;
1007   LLT S32 = LLT::scalar(32);
1008 
1009   auto Const0 = B.buildConstant(S32, FractBits - 32);
1010   auto Const1 = B.buildConstant(S32, ExpBits);
1011 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1015 
1016   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1017 }
1018 
1019 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1020   MachineInstr &MI, MachineRegisterInfo &MRI,
1021   MachineIRBuilder &B) const {
1022   B.setInstr(MI);
1023 
1024   const LLT S1 = LLT::scalar(1);
1025   const LLT S32 = LLT::scalar(32);
1026   const LLT S64 = LLT::scalar(64);
1027 
1028   Register Src = MI.getOperand(1).getReg();
1029   assert(MRI.getType(Src) == S64);
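
  // Truncate by clearing the fraction bits below the binary point: an
  // exponent below 0 leaves only the sign (+/-0.0), an exponent above 51
  // means the value is already integral, and anything in between masks off
  // the low (52 - exponent) bits of the significand.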
1030 
1031   // TODO: Should this use extract since the low half is unused?
1032   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1033   Register Hi = Unmerge.getReg(1);
1034 
1035   // Extract the upper half, since this is where we will find the sign and
1036   // exponent.
1037   auto Exp = extractF64Exponent(Hi, B);
1038 
1039   const unsigned FractBits = 52;
1040 
1041   // Extract the sign bit.
1042   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1043   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1044 
1045   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1046 
1047   const auto Zero32 = B.buildConstant(S32, 0);
1048 
1049   // Extend back to 64-bits.
1050   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1051 
1052   auto Shr = B.buildAShr(S64, FractMask, Exp);
1053   auto Not = B.buildNot(S64, Shr);
1054   auto Tmp0 = B.buildAnd(S64, Src, Not);
1055   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1056 
1057   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1058   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1059 
1060   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1063 }
1064 
1065 bool AMDGPULegalizerInfo::legalizeITOFP(
1066   MachineInstr &MI, MachineRegisterInfo &MRI,
1067   MachineIRBuilder &B, bool Signed) const {
1068   B.setInstr(MI);
1069 
1070   Register Dst = MI.getOperand(0).getReg();
1071   Register Src = MI.getOperand(1).getReg();
1072 
1073   const LLT S64 = LLT::scalar(64);
1074   const LLT S32 = LLT::scalar(32);
1075 
1076   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
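
  // Convert the two 32-bit halves separately (the high half signed or
  // unsigned as requested, the low half always unsigned) and recombine as
  // fp(Hi) * 2^32 + fp(Lo) using ldexp.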
1077 
1078   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1079 
1080   auto CvtHi = Signed ?
1081     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1082     B.buildUITOFP(S64, Unmerge.getReg(1));
1083 
1084   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1085 
1086   auto ThirtyTwo = B.buildConstant(S32, 32);
1087   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1088     .addUse(CvtHi.getReg(0))
1089     .addUse(ThirtyTwo.getReg(0));
1090 
1091   // TODO: Should this propagate fast-math-flags?
1092   B.buildFAdd(Dst, LdExp, CvtLo);
1093   MI.eraseFromParent();
1094   return true;
1095 }
1096 
1097 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1098   MachineInstr &MI, MachineRegisterInfo &MRI,
1099   MachineIRBuilder &B) const {
1100   MachineFunction &MF = B.getMF();
1101   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1102 
1103   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1104                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1105 
1106   // With ieee_mode disabled, the instructions have the correct behavior
1107   // already for G_FMINNUM/G_FMAXNUM
1108   if (!MFI->getMode().IEEE)
1109     return !IsIEEEOp;
1110 
1111   if (IsIEEEOp)
1112     return true;
1113 
1114   MachineIRBuilder HelperBuilder(MI);
1115   GISelObserverWrapper DummyObserver;
1116   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1117   HelperBuilder.setMBB(*MI.getParent());
1118   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1119 }
1120 
// Return the use branch instruction, or null if the usage is invalid.
1122 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1123                                        MachineRegisterInfo &MRI) {
1124   Register CondDef = MI.getOperand(0).getReg();
1125   if (!MRI.hasOneNonDBGUse(CondDef))
1126     return nullptr;
1127 
1128   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1129   return UseMI.getParent() == MI.getParent() &&
1130     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1131 }
1132 
// Return the vreg holding the live-in value of Reg, creating one of type Ty
// if it does not exist yet.
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1134                                                 Register Reg, LLT Ty) const {
1135   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1136   if (LiveIn)
1137     return LiveIn;
1138 
1139   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1140   MRI.addLiveIn(Reg, NewReg);
1141   return NewReg;
1142 }
1143 
1144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1145                                          const ArgDescriptor *Arg) const {
1146   if (!Arg->isRegister())
1147     return false; // TODO: Handle these
1148 
1149   assert(Arg->getRegister() != 0);
1150   assert(Arg->getRegister().isPhysical());
1151 
1152   MachineRegisterInfo &MRI = *B.getMRI();
1153 
1154   LLT Ty = MRI.getType(DstReg);
1155   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1156 
1157   if (Arg->isMasked()) {
1158     // TODO: Should we try to emit this once in the entry block?
1159     const LLT S32 = LLT::scalar(32);
1160     const unsigned Mask = Arg->getMask();
1161     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
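    // E.g. with an illustrative mask of 0x3ff00000, Shift is 20 and the
    // field is recovered as (LiveIn >> 20) & 0x3ff.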
1162 
1163     auto ShiftAmt = B.buildConstant(S32, Shift);
1164     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1165     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1166   } else
1167     B.buildCopy(DstReg, LiveIn);
1168 
  // Insert the argument copy if it doesn't already exist.
1170   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1171   if (!MRI.getVRegDef(LiveIn)) {
1172     MachineBasicBlock &EntryMBB = B.getMF().front();
1173     EntryMBB.addLiveIn(Arg->getRegister());
1174     B.setInsertPt(EntryMBB, EntryMBB.begin());
1175     B.buildCopy(LiveIn, Arg->getRegister());
1176   }
1177 
1178   return true;
1179 }
1180 
1181 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1182   MachineInstr &MI,
1183   MachineRegisterInfo &MRI,
1184   MachineIRBuilder &B,
1185   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1186   B.setInstr(MI);
1187 
1188   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1189 
1190   const ArgDescriptor *Arg;
1191   const TargetRegisterClass *RC;
1192   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1193   if (!Arg) {
1194     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1195     return false;
1196   }
1197 
1198   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1199     MI.eraseFromParent();
1200     return true;
1201   }
1202 
1203   return false;
1204 }
1205 
1206 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1207                                                  MachineRegisterInfo &MRI,
1208                                                  MachineIRBuilder &B) const {
1209   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1210   if (!MFI->isEntryFunction()) {
1211     return legalizePreloadedArgIntrin(MI, MRI, B,
1212                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1213   }
1214 
1215   B.setInstr(MI);
1216 
1217   uint64_t Offset =
1218     ST.getTargetLowering()->getImplicitParameterOffset(
1219       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1220   Register DstReg = MI.getOperand(0).getReg();
1221   LLT DstTy = MRI.getType(DstReg);
1222   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1223 
1224   const ArgDescriptor *Arg;
1225   const TargetRegisterClass *RC;
1226   std::tie(Arg, RC)
1227     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1228   if (!Arg)
1229     return false;
1230 
1231   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1232   if (!loadInputValue(KernargPtrReg, B, Arg))
1233     return false;
1234 
1235   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1236   MI.eraseFromParent();
1237   return true;
1238 }
1239 
1240 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1241                                             MachineRegisterInfo &MRI,
1242                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
1244   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1245   case Intrinsic::amdgcn_if: {
1246     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1247       const SIRegisterInfo *TRI
1248         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1249 
1250       B.setInstr(*BrCond);
1251       Register Def = MI.getOperand(1).getReg();
1252       Register Use = MI.getOperand(3).getReg();
1253       B.buildInstr(AMDGPU::SI_IF)
1254         .addDef(Def)
1255         .addUse(Use)
1256         .addMBB(BrCond->getOperand(1).getMBB());
1257 
1258       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1259       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1260       MI.eraseFromParent();
1261       BrCond->eraseFromParent();
1262       return true;
1263     }
1264 
1265     return false;
1266   }
1267   case Intrinsic::amdgcn_loop: {
1268     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1269       const SIRegisterInfo *TRI
1270         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1271 
1272       B.setInstr(*BrCond);
1273       Register Reg = MI.getOperand(2).getReg();
1274       B.buildInstr(AMDGPU::SI_LOOP)
1275         .addUse(Reg)
1276         .addMBB(BrCond->getOperand(1).getMBB());
1277       MI.eraseFromParent();
1278       BrCond->eraseFromParent();
1279       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1280       return true;
1281     }
1282 
1283     return false;
1284   }
1285   case Intrinsic::amdgcn_kernarg_segment_ptr:
1286     return legalizePreloadedArgIntrin(
1287       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1288   case Intrinsic::amdgcn_implicitarg_ptr:
1289     return legalizeImplicitArgPtr(MI, MRI, B);
1290   case Intrinsic::amdgcn_workitem_id_x:
1291     return legalizePreloadedArgIntrin(MI, MRI, B,
1292                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1293   case Intrinsic::amdgcn_workitem_id_y:
1294     return legalizePreloadedArgIntrin(MI, MRI, B,
1295                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1296   case Intrinsic::amdgcn_workitem_id_z:
1297     return legalizePreloadedArgIntrin(MI, MRI, B,
1298                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1299   case Intrinsic::amdgcn_workgroup_id_x:
1300     return legalizePreloadedArgIntrin(MI, MRI, B,
1301                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1302   case Intrinsic::amdgcn_workgroup_id_y:
1303     return legalizePreloadedArgIntrin(MI, MRI, B,
1304                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1305   case Intrinsic::amdgcn_workgroup_id_z:
1306     return legalizePreloadedArgIntrin(MI, MRI, B,
1307                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1308   case Intrinsic::amdgcn_dispatch_ptr:
1309     return legalizePreloadedArgIntrin(MI, MRI, B,
1310                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1311   case Intrinsic::amdgcn_queue_ptr:
1312     return legalizePreloadedArgIntrin(MI, MRI, B,
1313                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1314   case Intrinsic::amdgcn_implicit_buffer_ptr:
1315     return legalizePreloadedArgIntrin(
1316       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1317   case Intrinsic::amdgcn_dispatch_id:
1318     return legalizePreloadedArgIntrin(MI, MRI, B,
1319                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1320   default:
1321     return true;
1322   }
1323 
1324   return true;
1325 }
1326