1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
// Round the number of elements up to the next power of two.
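// e.g. <3 x s16> becomes <4 x s16> and <5 x s32> becomes <8 x s32>.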
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
// Round the scalar size up to the next power of two bits.
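// e.g. s24 becomes s32 and s65 becomes s128.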
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
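// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. <3 x s16> or <5 x s8>.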
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
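// Reduce the vector to roughly 64-bit pieces, e.g. <4 x s32> is split into
// <2 x s32> pieces and <3 x s64> into s64 pieces.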
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
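// e.g. <3 x s8> (24 bits) is padded to <4 x s8> (32 bits) and <5 x s8>
// (40 bits) to <8 x s8> (64 bits).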
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
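      // <6 x s16> -> <3 x s32>
      // <8 x s16> -> <4 x s32>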
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
629     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630     // situations (like an invalid implicit use), we don't want to infinite loop
631     // in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}});
657   if (ST.has16BitInsts())
658     FPToI.legalFor({{S16, S16}});
659   else
660     FPToI.minScalar(1, S32);
661 
662   FPToI.minScalar(0, S32)
663        .scalarize(0)
664        .lower();
665 
666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667     .scalarize(0)
668     .lower();
669 
670   if (ST.has16BitInsts()) {
671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672       .legalFor({S16, S32, S64})
673       .clampScalar(0, S16, S64)
674       .scalarize(0);
675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677       .legalFor({S32, S64})
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   } else {
681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682       .legalFor({S32})
683       .customFor({S64})
684       .clampScalar(0, S32, S64)
685       .scalarize(0);
686   }
687 
688   // FIXME: Clamp offset operand.
689   getActionDefinitionsBuilder(G_PTR_ADD)
690     .legalIf(isPointer(0))
691     .scalarize(0);
692 
693   getActionDefinitionsBuilder(G_PTRMASK)
694     .legalIf(typeInSet(1, {S64, S32}))
695     .minScalar(1, S32)
696     .maxScalarIf(sizeIs(0, 32), 1, S32)
697     .maxScalarIf(sizeIs(0, 64), 1, S64)
698     .scalarize(0);
699 
700   auto &CmpBuilder =
701     getActionDefinitionsBuilder(G_ICMP)
702     // The compare output type differs based on the register bank of the output,
703     // so make both s1 and s32 legal.
704     //
705     // Scalar compares producing output in scc will be promoted to s32, as that
706     // is the allocatable register type that will be needed for the copy from
707     // scc. This will be promoted during RegBankSelect, and we assume something
708     // before that won't try to use s32 result types.
709     //
710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711     // bank.
712     .legalForCartesianProduct(
713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714     .legalForCartesianProduct(
715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716   if (ST.has16BitInsts()) {
717     CmpBuilder.legalFor({{S1, S16}});
718   }
719 
720   CmpBuilder
721     .widenScalarToNextPow2(1)
722     .clampScalar(1, S32, S64)
723     .scalarize(0)
724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725 
726   getActionDefinitionsBuilder(G_FCMP)
727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728     .widenScalarToNextPow2(1)
729     .clampScalar(1, S32, S64)
730     .scalarize(0);
731 
732   // FIXME: fpow has a selection pattern that should move to custom lowering.
733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734   if (ST.has16BitInsts())
735     Exp2Ops.legalFor({S32, S16});
736   else
737     Exp2Ops.legalFor({S32});
738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739   Exp2Ops.scalarize(0);
740 
741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742   if (ST.has16BitInsts())
743     ExpOps.customFor({{S32}, {S16}});
744   else
745     ExpOps.customFor({S32});
746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
747         .scalarize(0);
748 
749   // The 64-bit versions produce 32-bit results, but only on the SALU.
750   getActionDefinitionsBuilder(G_CTPOP)
751     .legalFor({{S32, S32}, {S32, S64}})
752     .clampScalar(0, S32, S32)
753     .clampScalar(1, S32, S64)
754     .scalarize(0)
755     .widenScalarToNextPow2(0, 32)
756     .widenScalarToNextPow2(1, 32);
757 
758   // The hardware instructions return a different result on 0 than the generic
759   // instructions expect. The hardware produces -1, but these produce the
760   // bitwidth.
761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762     .scalarize(0)
763     .clampScalar(0, S32, S32)
764     .clampScalar(1, S32, S64)
765     .widenScalarToNextPow2(0, 32)
766     .widenScalarToNextPow2(1, 32)
767     .lower();
768 
769   // The 64-bit versions produce 32-bit results, but only on the SALU.
770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771     .legalFor({{S32, S32}, {S32, S64}})
772     .clampScalar(0, S32, S32)
773     .clampScalar(1, S32, S64)
774     .scalarize(0)
775     .widenScalarToNextPow2(0, 32)
776     .widenScalarToNextPow2(1, 32);
777 
778   getActionDefinitionsBuilder(G_BITREVERSE)
779     .legalFor({S32})
780     .clampScalar(0, S32, S32)
781     .scalarize(0);
782 
783   if (ST.has16BitInsts()) {
784     getActionDefinitionsBuilder(G_BSWAP)
785       .legalFor({S16, S32, V2S16})
786       .clampMaxNumElements(0, S16, 2)
787       // FIXME: Fixing non-power-of-2 before clamp is workaround for
788       // narrowScalar limitation.
789       .widenScalarToNextPow2(0)
790       .clampScalar(0, S16, S32)
791       .scalarize(0);
792 
793     if (ST.hasVOP3PInsts()) {
794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795         .legalFor({S32, S16, V2S16})
796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797         .clampMaxNumElements(0, S16, 2)
798         .minScalar(0, S16)
799         .widenScalarToNextPow2(0)
800         .scalarize(0)
801         .lower();
802     } else {
803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804         .legalFor({S32, S16})
805         .widenScalarToNextPow2(0)
806         .minScalar(0, S16)
807         .scalarize(0)
808         .lower();
809     }
810   } else {
811     // TODO: Should have same legality without v_perm_b32
812     getActionDefinitionsBuilder(G_BSWAP)
813       .legalFor({S32})
814       .lowerIf(scalarNarrowerThan(0, 32))
815       // FIXME: Fixing non-power-of-2 before clamp is workaround for
816       // narrowScalar limitation.
817       .widenScalarToNextPow2(0)
818       .maxScalar(0, S32)
819       .scalarize(0)
820       .lower();
821 
822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823       .legalFor({S32})
824       .minScalar(0, S32)
825       .widenScalarToNextPow2(0)
826       .scalarize(0)
827       .lower();
828   }
829 
830   getActionDefinitionsBuilder(G_INTTOPTR)
831     // List the common cases
832     .legalForCartesianProduct(AddrSpaces64, {S64})
833     .legalForCartesianProduct(AddrSpaces32, {S32})
834     .scalarize(0)
835     // Accept any address space as long as the size matches
836     .legalIf(sameSize(0, 1))
837     .widenScalarIf(smallerThan(1, 0),
838       [](const LegalityQuery &Query) {
839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840       })
841     .narrowScalarIf(largerThan(1, 0),
842       [](const LegalityQuery &Query) {
843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844       });
845 
846   getActionDefinitionsBuilder(G_PTRTOINT)
847     // List the common cases
848     .legalForCartesianProduct(AddrSpaces64, {S64})
849     .legalForCartesianProduct(AddrSpaces32, {S32})
850     .scalarize(0)
851     // Accept any address space as long as the size matches
852     .legalIf(sameSize(0, 1))
853     .widenScalarIf(smallerThan(0, 1),
854       [](const LegalityQuery &Query) {
855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856       })
857     .narrowScalarIf(
858       largerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       });
862 
863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864     .scalarize(0)
865     .custom();
866 
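  // Decide whether a load/store must be broken into multiple memory operations:
  // vector extloads, accesses larger than the address space allows, dword
  // counts that are not a power of two (3 dwords only with dwordx3 support),
  // or under-aligned accesses the target cannot perform directly.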
867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868                                     bool IsLoad) -> bool {
869     const LLT DstTy = Query.Types[0];
870 
871     // Split vector extloads.
872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873     unsigned Align = Query.MMODescrs[0].AlignInBits;
874 
875     if (MemSize < DstTy.getSizeInBits())
876       MemSize = std::max(MemSize, Align);
877 
878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879       return true;
880 
881     const LLT PtrTy = Query.Types[1];
882     unsigned AS = PtrTy.getAddressSpace();
883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884       return true;
885 
886     // Catch weird sized loads that don't evenly divide into the access sizes
887     // TODO: May be able to widen depending on alignment etc.
888     unsigned NumRegs = (MemSize + 31) / 32;
889     if (NumRegs == 3) {
890       if (!ST.hasDwordx3LoadStores())
891         return true;
892     } else {
893       // If the alignment allows, these should have been widened.
894       if (!isPowerOf2_32(NumRegs))
895         return true;
896     }
897 
898     if (Align < MemSize) {
899       const SITargetLowering *TLI = ST.getTargetLowering();
900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901     }
902 
903     return false;
904   };
905 
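  // Whether a non-power-of-2 load result should be widened to the next power
  // of two: only if the access is aligned enough to read the extra bytes and
  // the unrounded size is still below the address space's maximum access size
  // (96-bit accesses are left alone when dwordx3 load/stores are available).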
906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907                                          unsigned Opc) -> bool {
908     unsigned Size = Query.Types[0].getSizeInBits();
909     if (isPowerOf2_32(Size))
910       return false;
911 
912     if (Size == 96 && ST.hasDwordx3LoadStores())
913       return false;
914 
915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917       return false;
918 
919     unsigned Align = Query.MMODescrs[0].AlignInBits;
920     unsigned RoundedSize = NextPowerOf2(Size);
921     return (Align >= RoundedSize);
922   };
923 
924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927 
928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
929   // LDS
930   // TODO: Unsupported flat for SI.
931 
932   for (unsigned Op : {G_LOAD, G_STORE}) {
933     const bool IsStore = Op == G_STORE;
934 
935     auto &Actions = getActionDefinitionsBuilder(Op);
936     // Explicitly list some common cases.
937     // TODO: Does this help compile time at all?
938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
941                                       {S64, GlobalPtr, 64, GlobalAlign32},
942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
944                                       {S32, GlobalPtr, 8, GlobalAlign8},
945                                       {S32, GlobalPtr, 16, GlobalAlign16},
946 
947                                       {S32, LocalPtr, 32, 32},
948                                       {S64, LocalPtr, 64, 32},
949                                       {V2S32, LocalPtr, 64, 32},
950                                       {S32, LocalPtr, 8, 8},
951                                       {S32, LocalPtr, 16, 16},
952                                       {V2S16, LocalPtr, 32, 32},
953 
954                                       {S32, PrivatePtr, 32, 32},
955                                       {S32, PrivatePtr, 8, 8},
956                                       {S32, PrivatePtr, 16, 16},
957                                       {V2S16, PrivatePtr, 32, 32},
958 
959                                       {S32, ConstantPtr, 32, GlobalAlign32},
960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
962                                       {S64, ConstantPtr, 64, GlobalAlign32},
963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964     Actions.legalIf(
965       [=](const LegalityQuery &Query) -> bool {
966         return isLoadStoreLegal(ST, Query, Op);
967       });
968 
969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970     // 64-bits.
971     //
972     // TODO: Should generalize bitcast action into coerce, which will also cover
973     // inserting addrspacecasts.
974     Actions.customIf(typeIs(1, Constant32Ptr));
975 
976     // Turn any illegal element vectors into something easier to deal
977     // with. These will ultimately produce 32-bit scalar shifts to extract the
978     // parts anyway.
979     //
980     // For odd 16-bit element vectors, prefer to split those into pieces with
981     // 16-bit vector parts.
982     Actions.bitcastIf(
983       [=](const LegalityQuery &Query) -> bool {
984         const LLT Ty = Query.Types[0];
985         const unsigned Size = Ty.getSizeInBits();
986 
987         if (Size != Query.MMODescrs[0].SizeInBits)
988           return Size <= 32 && Ty.isVector();
989 
990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991           return true;
992         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993                !isRegisterVectorElementType(Ty.getElementType());
994       }, bitcastToRegisterType(0));
995 
996     Actions
997         .customIf(typeIs(1, Constant32Ptr))
998         // Widen suitably aligned loads by loading extra elements.
999         .moreElementsIf([=](const LegalityQuery &Query) {
1000             const LLT Ty = Query.Types[0];
1001             return Op == G_LOAD && Ty.isVector() &&
1002                    shouldWidenLoadResult(Query, Op);
1003           }, moreElementsToNextPow2(0))
1004         .widenScalarIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && !Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, widenScalarOrEltToNextPow2(0))
1009         .narrowScalarIf(
1010             [=](const LegalityQuery &Query) -> bool {
1011               return !Query.Types[0].isVector() &&
1012                      needToSplitMemOp(Query, Op == G_LOAD);
1013             },
1014             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015               const LLT DstTy = Query.Types[0];
1016               const LLT PtrTy = Query.Types[1];
1017 
1018               const unsigned DstSize = DstTy.getSizeInBits();
1019               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020 
1021               // Split extloads.
1022               if (DstSize > MemSize)
1023                 return std::make_pair(0, LLT::scalar(MemSize));
1024 
1025               if (!isPowerOf2_32(DstSize)) {
1026                 // We're probably decomposing an odd sized store. Try to split
1027                 // to the widest type. TODO: Account for alignment. As-is it
1028                 // should be OK, since the new parts will be further legalized.
1029                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030                 return std::make_pair(0, LLT::scalar(FloorSize));
1031               }
1032 
1033               if (DstSize > 32 && (DstSize % 32 != 0)) {
1034                 // FIXME: Need a way to specify non-extload of larger size if
1035                 // suitably aligned.
1036                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037               }
1038 
1039               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040                                                      PtrTy.getAddressSpace(),
1041                                                      Op == G_LOAD);
1042               if (MemSize > MaxSize)
1043                 return std::make_pair(0, LLT::scalar(MaxSize));
1044 
1045               unsigned Align = Query.MMODescrs[0].AlignInBits;
1046               return std::make_pair(0, LLT::scalar(Align));
1047             })
1048         .fewerElementsIf(
1049             [=](const LegalityQuery &Query) -> bool {
1050               return Query.Types[0].isVector() &&
1051                      needToSplitMemOp(Query, Op == G_LOAD);
1052             },
1053             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054               const LLT DstTy = Query.Types[0];
1055               const LLT PtrTy = Query.Types[1];
1056 
1057               LLT EltTy = DstTy.getElementType();
1058               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059                                                      PtrTy.getAddressSpace(),
1060                                                      Op == G_LOAD);
1061 
1062               // FIXME: Handle widened to power of 2 results better. This ends
1063               // up scalarizing.
1064               // FIXME: 3 element stores scalarized on SI
1065 
1066               // Split if it's too large for the address space.
1067               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068                 unsigned NumElts = DstTy.getNumElements();
1069                 unsigned EltSize = EltTy.getSizeInBits();
1070 
1071                 if (MaxSize % EltSize == 0) {
1072                   return std::make_pair(
1073                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074                 }
1075 
1076                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077 
1078                 // FIXME: Refine when odd breakdowns handled
1079                 // The scalars will need to be re-legalized.
1080                 if (NumPieces == 1 || NumPieces >= NumElts ||
1081                     NumElts % NumPieces != 0)
1082                   return std::make_pair(0, EltTy);
1083 
1084                 return std::make_pair(0,
1085                                       LLT::vector(NumElts / NumPieces, EltTy));
1086               }
1087 
1088               // FIXME: We could probably handle weird extending loads better.
1089               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090               if (DstTy.getSizeInBits() > MemSize)
1091                 return std::make_pair(0, EltTy);
1092 
1093               unsigned EltSize = EltTy.getSizeInBits();
1094               unsigned DstSize = DstTy.getSizeInBits();
1095               if (!isPowerOf2_32(DstSize)) {
1096                 // We're probably decomposing an odd sized store. Try to split
1097                 // to the widest type. TODO: Account for alignment. As-is it
1098                 // should be OK, since the new parts will be further legalized.
1099                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100                 return std::make_pair(
1101                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102               }
1103 
1104               // Need to split because of alignment.
1105               unsigned Align = Query.MMODescrs[0].AlignInBits;
1106               if (EltSize > Align &&
1107                   (EltSize / Align < DstTy.getNumElements())) {
1108                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109               }
1110 
1111               // May need relegalization for the scalars.
1112               return std::make_pair(0, EltTy);
1113             })
1114         .minScalar(0, S32);
1115 
1116     if (IsStore)
1117       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118 
1119     // TODO: Need a bitcast lower option?
1120     Actions
1121         .widenScalarToNextPow2(0)
1122         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123   }
1124 
1125   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127                                                   {S32, GlobalPtr, 16, 2 * 8},
1128                                                   {S32, LocalPtr, 8, 8},
1129                                                   {S32, LocalPtr, 16, 16},
1130                                                   {S32, PrivatePtr, 8, 8},
1131                                                   {S32, PrivatePtr, 16, 16},
1132                                                   {S32, ConstantPtr, 8, 8},
1133                                                   {S32, ConstantPtr, 16, 2 * 8}});
1134   if (ST.hasFlatAddressSpace()) {
1135     ExtLoads.legalForTypesWithMemDesc(
1136         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137   }
1138 
1139   ExtLoads.clampScalar(0, S32, S32)
1140           .widenScalarToNextPow2(0)
1141           .unsupportedIfMemSizeNotPow2()
1142           .lower();
1143 
1144   auto &Atomics = getActionDefinitionsBuilder(
1145     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148      G_ATOMICRMW_UMIN})
1149     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150                {S64, GlobalPtr}, {S64, LocalPtr}});
1151   if (ST.hasFlatAddressSpace()) {
1152     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153   }
1154 
1155   if (ST.hasLDSFPAtomics()) {
1156     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157       .legalFor({{S32, LocalPtr}});
1158   }
1159 
1160   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161   // demarshalling
1162   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164                 {S32, FlatPtr}, {S64, FlatPtr}})
1165     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166                {S32, RegionPtr}, {S64, RegionPtr}});
1167   // TODO: Pointer types, any 32-bit or 64-bit vector
1168 
1169   // Condition should be s32 for scalar, s1 for vector.
1170   getActionDefinitionsBuilder(G_SELECT)
1171     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174     .clampScalar(0, S16, S64)
1175     .scalarize(1)
1176     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178     .clampMaxNumElements(0, S32, 2)
1179     .clampMaxNumElements(0, LocalPtr, 2)
1180     .clampMaxNumElements(0, PrivatePtr, 2)
1181     .scalarize(0)
1182     .widenScalarToNextPow2(0)
1183     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184 
1185   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186   // be more flexible with the shift amount type.
1187   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188     .legalFor({{S32, S32}, {S64, S32}});
1189   if (ST.has16BitInsts()) {
1190     if (ST.hasVOP3PInsts()) {
1191       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192             .clampMaxNumElements(0, S16, 2);
1193     } else
1194       Shifts.legalFor({{S16, S16}});
1195 
1196     // TODO: Support 16-bit shift amounts for all types
1197     Shifts.widenScalarIf(
1198       [=](const LegalityQuery &Query) {
1199         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200         // 32-bit amount.
1201         const LLT ValTy = Query.Types[0];
1202         const LLT AmountTy = Query.Types[1];
1203         return ValTy.getSizeInBits() <= 16 &&
1204                AmountTy.getSizeInBits() < 16;
1205       }, changeTo(1, S16));
1206     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207     Shifts.clampScalar(1, S32, S32);
1208     Shifts.clampScalar(0, S16, S64);
1209     Shifts.widenScalarToNextPow2(0, 16);
1210   } else {
1211     // Make sure we legalize the shift amount type first, as the general
1212     // expansion for the shifted type will produce much worse code if it hasn't
1213     // been truncated already.
1214     Shifts.clampScalar(1, S32, S32);
1215     Shifts.clampScalar(0, S32, S64);
1216     Shifts.widenScalarToNextPow2(0, 32);
1217   }
1218   Shifts.scalarize(0);
1219 
1220   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223     unsigned IdxTypeIdx = 2;
1224 
1225     getActionDefinitionsBuilder(Op)
1226       .customIf([=](const LegalityQuery &Query) {
1227           const LLT EltTy = Query.Types[EltTypeIdx];
1228           const LLT VecTy = Query.Types[VecTypeIdx];
1229           const LLT IdxTy = Query.Types[IdxTypeIdx];
1230           return (EltTy.getSizeInBits() == 16 ||
1231                   EltTy.getSizeInBits() % 32 == 0) &&
1232                  VecTy.getSizeInBits() % 32 == 0 &&
1233                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1234                  IdxTy.getSizeInBits() == 32;
1235         })
1236       .clampScalar(EltTypeIdx, S32, S64)
1237       .clampScalar(VecTypeIdx, S32, S64)
1238       .clampScalar(IdxTypeIdx, S32, S32);
1239   }
1240 
1241   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242     .unsupportedIf([=](const LegalityQuery &Query) {
1243         const LLT &EltTy = Query.Types[1].getElementType();
1244         return Query.Types[0] != EltTy;
1245       });
1246 
1247   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250 
1251     // FIXME: Doesn't handle extract of illegal sizes.
1252     getActionDefinitionsBuilder(Op)
1253       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254       // FIXME: Multiples of 16 should not be legal.
1255       .legalIf([=](const LegalityQuery &Query) {
1256           const LLT BigTy = Query.Types[BigTyIdx];
1257           const LLT LitTy = Query.Types[LitTyIdx];
1258           return (BigTy.getSizeInBits() % 32 == 0) &&
1259                  (LitTy.getSizeInBits() % 16 == 0);
1260         })
1261       .widenScalarIf(
1262         [=](const LegalityQuery &Query) {
1263           const LLT BigTy = Query.Types[BigTyIdx];
1264           return (BigTy.getScalarSizeInBits() < 16);
1265         },
1266         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267       .widenScalarIf(
1268         [=](const LegalityQuery &Query) {
1269           const LLT LitTy = Query.Types[LitTyIdx];
1270           return (LitTy.getScalarSizeInBits() < 16);
1271         },
1272         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274       .widenScalarToNextPow2(BigTyIdx, 32);
1275 
1276   }
1277 
1278   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279     .legalForCartesianProduct(AllS32Vectors, {S32})
1280     .legalForCartesianProduct(AllS64Vectors, {S64})
1281     .clampNumElements(0, V16S32, V32S32)
1282     .clampNumElements(0, V2S64, V16S64)
1283     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284 
1285   if (ST.hasScalarPackInsts()) {
1286     BuildVector
1287       // FIXME: Should probably widen s1 vectors straight to s32
1288       .minScalarOrElt(0, S16)
1289       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290       .minScalar(1, S32);
1291 
1292     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293       .legalFor({V2S16, S32})
1294       .lower();
1295     BuildVector.minScalarOrElt(0, S32);
1296   } else {
1297     BuildVector.customFor({V2S16, S16});
1298     BuildVector.minScalarOrElt(0, S32);
1299 
1300     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301       .customFor({V2S16, S32})
1302       .lower();
1303   }
1304 
1305   BuildVector.legalIf(isRegisterType(0));
1306 
1307   // FIXME: Clamp maximum size
1308   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309     .legalIf(isRegisterType(0));
1310 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1312   // pre-legalize.
1313   if (ST.hasVOP3PInsts()) {
1314     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315       .customFor({V2S16, V2S16})
1316       .lower();
1317   } else
1318     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319 
1320   // Merge/Unmerge
1321   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324 
1325     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326       const LLT Ty = Query.Types[TypeIdx];
1327       if (Ty.isVector()) {
1328         const LLT &EltTy = Ty.getElementType();
1329         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330           return true;
1331         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332           return true;
1333       }
1334       return false;
1335     };
1336 
1337     auto &Builder = getActionDefinitionsBuilder(Op)
1338       .lowerFor({{S16, V2S16}})
1339       .lowerIf([=](const LegalityQuery &Query) {
1340           const LLT BigTy = Query.Types[BigTyIdx];
1341           return BigTy.getSizeInBits() == 32;
1342         })
1343       // Try to widen to s16 first for small types.
1344       // TODO: Only do this on targets with legal s16 shifts
1345       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349                            elementTypeIs(1, S16)),
1350                        changeTo(1, V2S16))
1351       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1352       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353       // valid.
1354       .clampScalar(LitTyIdx, S32, S512)
1355       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356       // Break up vectors with weird elements into scalars
1357       .fewerElementsIf(
1358         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359         scalarize(0))
1360       .fewerElementsIf(
1361         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362         scalarize(1))
1363       .clampScalar(BigTyIdx, S32, MaxScalar);
1364 
1365     if (Op == G_MERGE_VALUES) {
1366       Builder.widenScalarIf(
1367         // TODO: Use 16-bit shifts if legal for 8-bit values?
1368         [=](const LegalityQuery &Query) {
1369           const LLT Ty = Query.Types[LitTyIdx];
1370           return Ty.getSizeInBits() < 32;
1371         },
1372         changeTo(LitTyIdx, S32));
1373     }
1374 
1375     Builder.widenScalarIf(
1376       [=](const LegalityQuery &Query) {
1377         const LLT Ty = Query.Types[BigTyIdx];
1378         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379           Ty.getSizeInBits() % 16 != 0;
1380       },
1381       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
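        // e.g. s90 widens to s128, while s300 widens to s320 rather than s512.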
1384         const LLT &Ty = Query.Types[BigTyIdx];
1385         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386         if (NewSizeInBits >= 256) {
1387           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388           if (RoundedTo < NewSizeInBits)
1389             NewSizeInBits = RoundedTo;
1390         }
1391         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392       })
1393       .legalIf([=](const LegalityQuery &Query) {
1394           const LLT &BigTy = Query.Types[BigTyIdx];
1395           const LLT &LitTy = Query.Types[LitTyIdx];
1396 
1397           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398             return false;
1399           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400             return false;
1401 
1402           return BigTy.getSizeInBits() % 16 == 0 &&
1403                  LitTy.getSizeInBits() % 16 == 0 &&
1404                  BigTy.getSizeInBits() <= MaxRegisterSize;
1405         })
1406       // Any vectors left are the wrong size. Scalarize them.
1407       .scalarize(0)
1408       .scalarize(1);
1409   }
1410 
1411   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412   // RegBankSelect.
1413   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414     .legalFor({{S32}, {S64}});
1415 
1416   if (ST.hasVOP3PInsts()) {
1417     SextInReg.lowerFor({{V2S16}})
1418       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419       // get more vector shift opportunities, since we'll get those when
1420       // expanded.
1421       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422   } else if (ST.has16BitInsts()) {
1423     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424   } else {
1425     // Prefer to promote to s32 before lowering if we don't have 16-bit
1426     // shifts. This avoids a lot of intermediate truncate and extend operations.
1427     SextInReg.lowerFor({{S32}, {S64}});
1428   }
1429 
1430   SextInReg
1431     .scalarize(0)
1432     .clampScalar(0, S32, S64)
1433     .lower();
1434 
1435   getActionDefinitionsBuilder(G_FSHR)
1436     .legalFor({{S32, S32}})
1437     .scalarize(0)
1438     .lower();
1439 
1440   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1441     .legalFor({S64});
1442 
1443   getActionDefinitionsBuilder({
1444       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1445       G_FCOPYSIGN,
1446 
1447       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1448       G_READ_REGISTER,
1449       G_WRITE_REGISTER,
1450 
1451       G_SADDO, G_SSUBO,
1452 
1453        // TODO: Implement
1454       G_FMINIMUM, G_FMAXIMUM,
1455       G_FSHL
1456     }).lower();
1457 
1458   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1459         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1460         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1461     .unsupported();
1462 
1463   computeTables();
1464   verify(*ST.getInstrInfo());
1465 }
1466 
1467 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1468                                          MachineInstr &MI) const {
1469   MachineIRBuilder &B = Helper.MIRBuilder;
1470   MachineRegisterInfo &MRI = *B.getMRI();
1471   GISelChangeObserver &Observer = Helper.Observer;
1472 
1473   switch (MI.getOpcode()) {
1474   case TargetOpcode::G_ADDRSPACE_CAST:
1475     return legalizeAddrSpaceCast(MI, MRI, B);
1476   case TargetOpcode::G_FRINT:
1477     return legalizeFrint(MI, MRI, B);
1478   case TargetOpcode::G_FCEIL:
1479     return legalizeFceil(MI, MRI, B);
1480   case TargetOpcode::G_INTRINSIC_TRUNC:
1481     return legalizeIntrinsicTrunc(MI, MRI, B);
1482   case TargetOpcode::G_SITOFP:
1483     return legalizeITOFP(MI, MRI, B, true);
1484   case TargetOpcode::G_UITOFP:
1485     return legalizeITOFP(MI, MRI, B, false);
1486   case TargetOpcode::G_FPTOSI:
1487     return legalizeFPTOI(MI, MRI, B, true);
1488   case TargetOpcode::G_FPTOUI:
1489     return legalizeFPTOI(MI, MRI, B, false);
1490   case TargetOpcode::G_FMINNUM:
1491   case TargetOpcode::G_FMAXNUM:
1492   case TargetOpcode::G_FMINNUM_IEEE:
1493   case TargetOpcode::G_FMAXNUM_IEEE:
1494     return legalizeMinNumMaxNum(Helper, MI);
1495   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1496     return legalizeExtractVectorElt(MI, MRI, B);
1497   case TargetOpcode::G_INSERT_VECTOR_ELT:
1498     return legalizeInsertVectorElt(MI, MRI, B);
1499   case TargetOpcode::G_SHUFFLE_VECTOR:
1500     return legalizeShuffleVector(MI, MRI, B);
1501   case TargetOpcode::G_FSIN:
1502   case TargetOpcode::G_FCOS:
1503     return legalizeSinCos(MI, MRI, B);
1504   case TargetOpcode::G_GLOBAL_VALUE:
1505     return legalizeGlobalValue(MI, MRI, B);
1506   case TargetOpcode::G_LOAD:
1507     return legalizeLoad(MI, MRI, B, Observer);
1508   case TargetOpcode::G_FMAD:
1509     return legalizeFMad(MI, MRI, B);
1510   case TargetOpcode::G_FDIV:
1511     return legalizeFDIV(MI, MRI, B);
1512   case TargetOpcode::G_UDIV:
1513   case TargetOpcode::G_UREM:
1514     return legalizeUDIV_UREM(MI, MRI, B);
1515   case TargetOpcode::G_SDIV:
1516   case TargetOpcode::G_SREM:
1517     return legalizeSDIV_SREM(MI, MRI, B);
1518   case TargetOpcode::G_ATOMIC_CMPXCHG:
1519     return legalizeAtomicCmpXChg(MI, MRI, B);
1520   case TargetOpcode::G_FLOG:
1521     return legalizeFlog(MI, B, numbers::ln2f);
1522   case TargetOpcode::G_FLOG10:
1523     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1524   case TargetOpcode::G_FEXP:
1525     return legalizeFExp(MI, B);
1526   case TargetOpcode::G_FPOW:
1527     return legalizeFPow(MI, B);
1528   case TargetOpcode::G_FFLOOR:
1529     return legalizeFFloor(MI, MRI, B);
1530   case TargetOpcode::G_BUILD_VECTOR:
1531     return legalizeBuildVector(MI, MRI, B);
1532   default:
1533     return false;
1534   }
1535 
1536   llvm_unreachable("expected switch to return");
1537 }
1538 
1539 Register AMDGPULegalizerInfo::getSegmentAperture(
1540   unsigned AS,
1541   MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   MachineFunction &MF = B.getMF();
1544   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1545   const LLT S32 = LLT::scalar(32);
1546 
1547   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1548 
1549   if (ST.hasApertureRegs()) {
1550     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1551     // getreg.
1552     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1553         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1554         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1555     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1556         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1557         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1558     unsigned Encoding =
1559         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1560         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1561         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1562 
1563     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1564 
1565     B.buildInstr(AMDGPU::S_GETREG_B32)
1566       .addDef(GetReg)
1567       .addImm(Encoding);
1568     MRI.setType(GetReg, S32);
1569 
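    // The hardware register holds the aperture base in a 16-bit field; shift it
    // up by the field width (16) to form the 32-bit value used as the high half
    // of flat pointers into this segment.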
1570     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1571     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1572   }
1573 
1574   Register QueuePtr = MRI.createGenericVirtualRegister(
1575     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1576 
1577   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1578   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1579     return Register();
1580 
1581   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1582   // private_segment_aperture_base_hi.
1583   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1584 
1585   // TODO: can we be smarter about machine pointer info?
1586   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1587   MachineMemOperand *MMO = MF.getMachineMemOperand(
1588       PtrInfo,
1589       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1590           MachineMemOperand::MOInvariant,
1591       4, commonAlignment(Align(64), StructOffset));
1592 
1593   Register LoadAddr;
1594 
1595   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1596   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1597 }
1598 
1599 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1600   MachineInstr &MI, MachineRegisterInfo &MRI,
1601   MachineIRBuilder &B) const {
1602   MachineFunction &MF = B.getMF();
1603 
1604   const LLT S32 = LLT::scalar(32);
1605   Register Dst = MI.getOperand(0).getReg();
1606   Register Src = MI.getOperand(1).getReg();
1607 
1608   LLT DstTy = MRI.getType(Dst);
1609   LLT SrcTy = MRI.getType(Src);
1610   unsigned DestAS = DstTy.getAddressSpace();
1611   unsigned SrcAS = SrcTy.getAddressSpace();
1612 
1613   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1614   // vector element.
1615   assert(!DstTy.isVector());
1616 
1617   const AMDGPUTargetMachine &TM
1618     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1619 
1620   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1621   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1622     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1623     return true;
1624   }
1625 
1626   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1627     // Truncate.
1628     B.buildExtract(Dst, Src, 0);
1629     MI.eraseFromParent();
1630     return true;
1631   }
1632 
1633   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1634     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1635     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1636 
1637     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1638     // another pointer type. Merge operands are required to be the same type,
1639     // but creating an extra ptrtoint would be kind of pointless.
1640     auto HighAddr = B.buildConstant(
1641       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1642     B.buildMerge(Dst, {Src, HighAddr});
1643     MI.eraseFromParent();
1644     return true;
1645   }
1646 
1647   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1648     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1649            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1650     unsigned NullVal = TM.getNullPointerValue(DestAS);
1651 
1652     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1653     auto FlatNull = B.buildConstant(SrcTy, 0);
1654 
1655     // Extract low 32-bits of the pointer.
1656     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1657 
1658     auto CmpRes =
1659         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1660     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1661 
1662     MI.eraseFromParent();
1663     return true;
1664   }
1665 
1666   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1667     return false;
1668 
1669   if (!ST.hasFlatAddressSpace())
1670     return false;
1671 
1672   auto SegmentNull =
1673       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1674   auto FlatNull =
1675       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1676 
1677   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1678   if (!ApertureReg.isValid())
1679     return false;
1680 
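  // A null segment pointer must map to the flat null value; any other pointer
  // becomes {low 32 bits of the segment address, aperture base}.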
1681   auto CmpRes =
1682       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1683 
1684   // Coerce the type of the low half of the result so we can use merge_values.
1685   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1686 
1687   // TODO: Should we allow mismatched types but matching sizes in merges to
1688   // avoid the ptrtoint?
1689   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1690   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1691 
1692   MI.eraseFromParent();
1693   return true;
1694 }
1695 
1696 bool AMDGPULegalizerInfo::legalizeFrint(
1697   MachineInstr &MI, MachineRegisterInfo &MRI,
1698   MachineIRBuilder &B) const {
1699   Register Src = MI.getOperand(1).getReg();
1700   LLT Ty = MRI.getType(Src);
1701   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1702 
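  // Adding and then subtracting 2^52 (with the sign of the input) rounds the
  // value to an integer in the current rounding mode. Inputs whose magnitude is
  // at least 2^52 are already integers and are returned unchanged.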
1703   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1704   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1705 
1706   auto C1 = B.buildFConstant(Ty, C1Val);
1707   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1708 
1709   // TODO: Should this propagate fast-math-flags?
1710   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1711   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1712 
1713   auto C2 = B.buildFConstant(Ty, C2Val);
1714   auto Fabs = B.buildFAbs(Ty, Src);
1715 
1716   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1717   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
1718   return true;
1719 }
1720 
1721 bool AMDGPULegalizerInfo::legalizeFceil(
1722   MachineInstr &MI, MachineRegisterInfo &MRI,
1723   MachineIRBuilder &B) const {
1724 
1725   const LLT S1 = LLT::scalar(1);
1726   const LLT S64 = LLT::scalar(64);
1727 
1728   Register Src = MI.getOperand(1).getReg();
1729   assert(MRI.getType(Src) == S64);
1730 
1731   // result = trunc(src)
1732   // if (src > 0.0 && src != result)
1733   //   result += 1.0
1734 
1735   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1736 
1737   const auto Zero = B.buildFConstant(S64, 0.0);
1738   const auto One = B.buildFConstant(S64, 1.0);
1739   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1740   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1741   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1742   auto Add = B.buildSelect(S64, And, One, Zero);
1743 
1744   // TODO: Should this propagate fast-math-flags?
1745   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1746   return true;
1747 }
1748 
1749 static MachineInstrBuilder extractF64Exponent(Register Hi,
1750                                               MachineIRBuilder &B) {
1751   const unsigned FractBits = 52;
1752   const unsigned ExpBits = 11;
1753   LLT S32 = LLT::scalar(32);
1754 
1755   auto Const0 = B.buildConstant(S32, FractBits - 32);
1756   auto Const1 = B.buildConstant(S32, ExpBits);
1757 
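  // Pull the 11-bit biased exponent out of the high dword with ubfe, then
  // subtract the bias (1023) to get the real exponent.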
1758   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
1759     .addUse(Const0.getReg(0))
1760     .addUse(Const1.getReg(0));
1761 
1762   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1763 }
1764 
1765 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1766   MachineInstr &MI, MachineRegisterInfo &MRI,
1767   MachineIRBuilder &B) const {
1768   const LLT S1 = LLT::scalar(1);
1769   const LLT S32 = LLT::scalar(32);
1770   const LLT S64 = LLT::scalar(64);
1771 
1772   Register Src = MI.getOperand(1).getReg();
1773   assert(MRI.getType(Src) == S64);
1774 
1775   // TODO: Should this use extract since the low half is unused?
1776   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1777   Register Hi = Unmerge.getReg(1);
1778 
1779   // Extract the upper half, since this is where we will find the sign and
1780   // exponent.
1781   auto Exp = extractF64Exponent(Hi, B);
1782 
1783   const unsigned FractBits = 52;
1784 
1785   // Extract the sign bit.
1786   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1787   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1788 
1789   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1790 
1791   const auto Zero32 = B.buildConstant(S32, 0);
1792 
1793   // Extend back to 64-bits.
1794   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1795 
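  // Clear the fraction bits that fall below the exponent. If the exponent is
  // negative the magnitude is below 1.0 and the result is just the signed zero;
  // if it is greater than 51 there are no fraction bits to clear.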
1796   auto Shr = B.buildAShr(S64, FractMask, Exp);
1797   auto Not = B.buildNot(S64, Shr);
1798   auto Tmp0 = B.buildAnd(S64, Src, Not);
1799   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1800 
1801   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1802   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1803 
1804   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1805   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
1806   return true;
1807 }
1808 
1809 bool AMDGPULegalizerInfo::legalizeITOFP(
1810   MachineInstr &MI, MachineRegisterInfo &MRI,
1811   MachineIRBuilder &B, bool Signed) const {
1812 
1813   Register Dst = MI.getOperand(0).getReg();
1814   Register Src = MI.getOperand(1).getReg();
1815 
1816   const LLT S64 = LLT::scalar(64);
1817   const LLT S32 = LLT::scalar(32);
1818 
1819   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1820 
1821   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1822 
1823   auto CvtHi = Signed ?
1824     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1825     B.buildUITOFP(S64, Unmerge.getReg(1));
1826 
1827   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1828 
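  // Scale the converted high half by 2^32 with ldexp and add in the unsigned
  // conversion of the low half: result = (double)hi * 2^32 + (double)lo.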
1829   auto ThirtyTwo = B.buildConstant(S32, 32);
1830   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1831     .addUse(CvtHi.getReg(0))
1832     .addUse(ThirtyTwo.getReg(0));
1833 
1834   // TODO: Should this propagate fast-math-flags?
1835   B.buildFAdd(Dst, LdExp, CvtLo);
1836   MI.eraseFromParent();
1837   return true;
1838 }
1839 
1840 // TODO: Copied from DAG implementation. Verify logic and document how this
1841 // actually works.
1842 bool AMDGPULegalizerInfo::legalizeFPTOI(
1843   MachineInstr &MI, MachineRegisterInfo &MRI,
1844   MachineIRBuilder &B, bool Signed) const {
1845 
1846   Register Dst = MI.getOperand(0).getReg();
1847   Register Src = MI.getOperand(1).getReg();
1848 
1849   const LLT S64 = LLT::scalar(64);
1850   const LLT S32 = LLT::scalar(32);
1851 
1852   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1853 
1854   unsigned Flags = MI.getFlags();
1855 
1856   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1857   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1858   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
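  // K0 = 2^-32 and K1 = -(2^32): split trunc(x) into a high part
  // floor(trunc(x) * 2^-32) and the remainder trunc(x) - hi * 2^32, then
  // convert each piece to a 32-bit integer.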
1859 
1860   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1861   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1862   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1863 
1864   auto Hi = Signed ?
1865     B.buildFPTOSI(S32, FloorMul) :
1866     B.buildFPTOUI(S32, FloorMul);
1867   auto Lo = B.buildFPTOUI(S32, Fma);
1868 
1869   B.buildMerge(Dst, { Lo, Hi });
1870   MI.eraseFromParent();
1871 
1872   return true;
1873 }
1874 
1875 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1876                                                MachineInstr &MI) const {
1877   MachineFunction &MF = Helper.MIRBuilder.getMF();
1878   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1879 
1880   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1881                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1882 
1883   // With ieee_mode disabled, the instructions have the correct behavior
1884   // already for G_FMINNUM/G_FMAXNUM
1885   if (!MFI->getMode().IEEE)
1886     return !IsIEEEOp;
1887 
1888   if (IsIEEEOp)
1889     return true;
1890 
1891   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1892 }
1893 
1894 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1895   MachineInstr &MI, MachineRegisterInfo &MRI,
1896   MachineIRBuilder &B) const {
1897   // TODO: Should move some of this into LegalizerHelper.
1898 
1899   // TODO: Promote dynamic indexing of s16 to s32
1900 
1901   // FIXME: Artifact combiner probably should have replaced the truncated
1902   // constant before this, so we shouldn't need
1903   // getConstantVRegValWithLookThrough.
1904   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1905     MI.getOperand(2).getReg(), MRI);
1906   if (!IdxVal) // Dynamic case will be selected to register indexing.
1907     return true;
1908 
1909   Register Dst = MI.getOperand(0).getReg();
1910   Register Vec = MI.getOperand(1).getReg();
1911 
1912   LLT VecTy = MRI.getType(Vec);
1913   LLT EltTy = VecTy.getElementType();
1914   assert(EltTy == MRI.getType(Dst));
1915 
1916   if (IdxVal->Value < VecTy.getNumElements())
1917     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1918   else
1919     B.buildUndef(Dst);
1920 
1921   MI.eraseFromParent();
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   // TODO: Should move some of this into LegalizerHelper.
1929 
1930   // TODO: Promote dynamic indexing of s16 to s32
1931 
1932   // FIXME: Artifact combiner probably should have replaced the truncated
1933   // constant before this, so we shouldn't need
1934   // getConstantVRegValWithLookThrough.
1935   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1936     MI.getOperand(3).getReg(), MRI);
1937   if (!IdxVal) // Dynamic case will be selected to register indexing.
1938     return true;
1939 
1940   Register Dst = MI.getOperand(0).getReg();
1941   Register Vec = MI.getOperand(1).getReg();
1942   Register Ins = MI.getOperand(2).getReg();
1943 
1944   LLT VecTy = MRI.getType(Vec);
1945   LLT EltTy = VecTy.getElementType();
1946   assert(EltTy == MRI.getType(Ins));
1947 
1948   if (IdxVal->Value < VecTy.getNumElements())
1949     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1950   else
1951     B.buildUndef(Dst);
1952 
1953   MI.eraseFromParent();
1954   return true;
1955 }
1956 
1957 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1958   MachineInstr &MI, MachineRegisterInfo &MRI,
1959   MachineIRBuilder &B) const {
1960   const LLT V2S16 = LLT::vector(2, 16);
1961 
1962   Register Dst = MI.getOperand(0).getReg();
1963   Register Src0 = MI.getOperand(1).getReg();
1964   LLT DstTy = MRI.getType(Dst);
1965   LLT SrcTy = MRI.getType(Src0);
1966 
1967   if (SrcTy == V2S16 && DstTy == V2S16 &&
1968       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1969     return true;
1970 
1971   MachineIRBuilder HelperBuilder(MI);
1972   GISelObserverWrapper DummyObserver;
1973   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1974   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1975 }
1976 
1977 bool AMDGPULegalizerInfo::legalizeSinCos(
1978   MachineInstr &MI, MachineRegisterInfo &MRI,
1979   MachineIRBuilder &B) const {
1980 
1981   Register DstReg = MI.getOperand(0).getReg();
1982   Register SrcReg = MI.getOperand(1).getReg();
1983   LLT Ty = MRI.getType(DstReg);
1984   unsigned Flags = MI.getFlags();
1985 
1986   Register TrigVal;
1987   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1988   if (ST.hasTrigReducedRange()) {
1989     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1990     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1991       .addUse(MulVal.getReg(0))
1992       .setMIFlags(Flags).getReg(0);
1993   } else
1994     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1995 
1996   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1997     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1998   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1999     .addUse(TrigVal)
2000     .setMIFlags(Flags);
2001   MI.eraseFromParent();
2002   return true;
2003 }
2004 
2005 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2006                                                   MachineIRBuilder &B,
2007                                                   const GlobalValue *GV,
2008                                                   int64_t Offset,
2009                                                   unsigned GAFlags) const {
2010   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2011   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2012   // to the following code sequence:
2013   //
2014   // For constant address space:
2015   //   s_getpc_b64 s[0:1]
2016   //   s_add_u32 s0, s0, $symbol
2017   //   s_addc_u32 s1, s1, 0
2018   //
2019   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2020   //   a fixup or relocation is emitted to replace $symbol with a literal
2021   //   constant, which is a pc-relative offset from the encoding of the $symbol
2022   //   operand to the global variable.
2023   //
2024   // For global address space:
2025   //   s_getpc_b64 s[0:1]
2026   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2027   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2028   //
2029   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2030   //   fixups or relocations are emitted to replace $symbol@*@lo and
2031   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2032   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2033   //   operand to the global variable.
2034   //
2035   // What we want here is an offset from the value returned by s_getpc
2036   // (which is the address of the s_add_u32 instruction) to the global
2037   // variable, but since the encoding of $symbol starts 4 bytes after the start
2038   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2039   // small. This requires us to add 4 to the global variable offset in order to
2040   // compute the correct address.
2041 
2042   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2043 
2044   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2045     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2046 
2047   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2048     .addDef(PCReg);
2049 
2050   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2051   if (GAFlags == SIInstrInfo::MO_NONE)
2052     MIB.addImm(0);
2053   else
2054     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2055 
2056   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2057 
2058   if (PtrTy.getSizeInBits() == 32)
2059     B.buildExtract(DstReg, PCReg, 0);
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2064   MachineInstr &MI, MachineRegisterInfo &MRI,
2065   MachineIRBuilder &B) const {
2066   Register DstReg = MI.getOperand(0).getReg();
2067   LLT Ty = MRI.getType(DstReg);
2068   unsigned AS = Ty.getAddressSpace();
2069 
2070   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2071   MachineFunction &MF = B.getMF();
2072   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2073 
2074   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2075     if (!MFI->isEntryFunction()) {
2076       const Function &Fn = MF.getFunction();
2077       DiagnosticInfoUnsupported BadLDSDecl(
2078         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2079         DS_Warning);
2080       Fn.getContext().diagnose(BadLDSDecl);
2081 
2082       // We currently don't have a way to correctly allocate LDS objects that
2083       // aren't directly associated with a kernel. We do force inlining of
2084       // functions that use local objects. However, if these dead functions are
2085       // not eliminated, we don't want a compile time error. Just emit a warning
2086       // and a trap, since there should be no callable path here.
2087       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2088       B.buildUndef(DstReg);
2089       MI.eraseFromParent();
2090       return true;
2091     }
2092 
2093     // TODO: We could emit code to handle the initialization somewhere.
2094     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2095       const SITargetLowering *TLI = ST.getTargetLowering();
2096       if (!TLI->shouldUseLDSConstAddress(GV)) {
2097         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2098         return true; // Leave in place;
2099       }
2100 
2101       B.buildConstant(
2102           DstReg,
2103           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2104       MI.eraseFromParent();
2105       return true;
2106     }
2107 
2108     const Function &Fn = MF.getFunction();
2109     DiagnosticInfoUnsupported BadInit(
2110       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2111     Fn.getContext().diagnose(BadInit);
2112     return true;
2113   }
2114 
2115   const SITargetLowering *TLI = ST.getTargetLowering();
2116 
2117   if (TLI->shouldEmitFixup(GV)) {
2118     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2119     MI.eraseFromParent();
2120     return true;
2121   }
2122 
2123   if (TLI->shouldEmitPCReloc(GV)) {
2124     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2125     MI.eraseFromParent();
2126     return true;
2127   }
2128 
2129   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2130   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2131 
2132   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2133       MachinePointerInfo::getGOT(MF),
2134       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2135           MachineMemOperand::MOInvariant,
2136       8 /*Size*/, Align(8));
2137 
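  // No direct relocation is possible, so go through the GOT: materialize the
  // address of the GOT entry pc-relatively and load the pointer from it.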
2138   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2139 
2140   if (Ty.getSizeInBits() == 32) {
2141     // Truncate if this is a 32-bit constant address.
2142     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2143     B.buildExtract(DstReg, Load, 0);
2144   } else
2145     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2146 
2147   MI.eraseFromParent();
2148   return true;
2149 }
2150 
2151 bool AMDGPULegalizerInfo::legalizeLoad(
2152   MachineInstr &MI, MachineRegisterInfo &MRI,
2153   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2154   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2155   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2156   Observer.changingInstr(MI);
2157   MI.getOperand(1).setReg(Cast.getReg(0));
2158   Observer.changedInstr(MI);
2159   return true;
2160 }
2161 
2162 bool AMDGPULegalizerInfo::legalizeFMad(
2163   MachineInstr &MI, MachineRegisterInfo &MRI,
2164   MachineIRBuilder &B) const {
2165   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2166   assert(Ty.isScalar());
2167 
2168   MachineFunction &MF = B.getMF();
2169   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2170 
2171   // TODO: Always legal with future ftz flag.
2172   // FIXME: Do we need just output?
2173   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2174     return true;
2175   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2176     return true;
2177 
2178   MachineIRBuilder HelperBuilder(MI);
2179   GISelObserverWrapper DummyObserver;
2180   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2181   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2185   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2186   Register DstReg = MI.getOperand(0).getReg();
2187   Register PtrReg = MI.getOperand(1).getReg();
2188   Register CmpVal = MI.getOperand(2).getReg();
2189   Register NewVal = MI.getOperand(3).getReg();
2190 
2191   assert(SITargetLowering::isFlatGlobalAddrSpace(
2192            MRI.getType(PtrReg).getAddressSpace()) &&
2193          "this should not have been custom lowered");
2194 
2195   LLT ValTy = MRI.getType(CmpVal);
2196   LLT VecTy = LLT::vector(2, ValTy);
2197 
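  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // together as a two-element vector operand.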
2198   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2199 
2200   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2201     .addDef(DstReg)
2202     .addUse(PtrReg)
2203     .addUse(PackedVal)
2204     .setMemRefs(MI.memoperands());
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 bool AMDGPULegalizerInfo::legalizeFlog(
2211   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2212   Register Dst = MI.getOperand(0).getReg();
2213   Register Src = MI.getOperand(1).getReg();
2214   LLT Ty = B.getMRI()->getType(Dst);
2215   unsigned Flags = MI.getFlags();
2216 
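  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes 1 / log2(b)
  // (i.e. ln(2) / ln(b)) as Log2BaseInverted.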
2217   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2218   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2219 
2220   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2221   MI.eraseFromParent();
2222   return true;
2223 }
2224 
2225 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2226                                        MachineIRBuilder &B) const {
2227   Register Dst = MI.getOperand(0).getReg();
2228   Register Src = MI.getOperand(1).getReg();
2229   unsigned Flags = MI.getFlags();
2230   LLT Ty = B.getMRI()->getType(Dst);
2231 
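  // exp(x) = exp2(x * log2(e)).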
2232   auto K = B.buildFConstant(Ty, numbers::log2e);
2233   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2234   B.buildFExp2(Dst, Mul, Flags);
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2240                                        MachineIRBuilder &B) const {
2241   Register Dst = MI.getOperand(0).getReg();
2242   Register Src0 = MI.getOperand(1).getReg();
2243   Register Src1 = MI.getOperand(2).getReg();
2244   unsigned Flags = MI.getFlags();
2245   LLT Ty = B.getMRI()->getType(Dst);
2246   const LLT S16 = LLT::scalar(16);
2247   const LLT S32 = LLT::scalar(32);
2248 
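  // Expand pow(x, y) as exp2(y * log2(x)). The legacy multiply is used for the
  // y * log2(x) product, presumably for its handling of zero/infinite inputs.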
2249   if (Ty == S32) {
2250     auto Log = B.buildFLog2(S32, Src0, Flags);
2251     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2252       .addUse(Log.getReg(0))
2253       .addUse(Src1)
2254       .setMIFlags(Flags);
2255     B.buildFExp2(Dst, Mul, Flags);
2256   } else if (Ty == S16) {
2257     // There's no f16 fmul_legacy, so we need to convert for it.
2258     auto Log = B.buildFLog2(S16, Src0, Flags);
2259     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2260     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2261     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2262       .addUse(Ext0.getReg(0))
2263       .addUse(Ext1.getReg(0))
2264       .setMIFlags(Flags);
2265 
2266     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2267   } else
2268     return false;
2269 
2270   MI.eraseFromParent();
2271   return true;
2272 }
2273 
2274 // Find a source register, ignoring any possible source modifiers.
2275 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2276   Register ModSrc = OrigSrc;
2277   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2278     ModSrc = SrcFNeg->getOperand(1).getReg();
2279     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2280       ModSrc = SrcFAbs->getOperand(1).getReg();
2281   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2282     ModSrc = SrcFAbs->getOperand(1).getReg();
2283   return ModSrc;
2284 }
2285 
2286 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2287                                          MachineRegisterInfo &MRI,
2288                                          MachineIRBuilder &B) const {
2289 
2290   const LLT S1 = LLT::scalar(1);
2291   const LLT S64 = LLT::scalar(64);
2292   Register Dst = MI.getOperand(0).getReg();
2293   Register OrigSrc = MI.getOperand(1).getReg();
2294   unsigned Flags = MI.getFlags();
2295   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2296          "this should not have been custom lowered");
2297 
2298   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2299   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2300   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2301   // V_FRACT bug is:
2302   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2303   //
2304   // Convert floor(x) to (x - fract(x))
2305 
2306   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2307     .addUse(OrigSrc)
2308     .setMIFlags(Flags);
2309 
2310   // Give source modifier matching some assistance before obscuring a foldable
2311   // pattern.
2312 
2313   // TODO: We can avoid the neg on the fract? The input sign to fract
2314   // shouldn't matter?
2315   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2316 
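  // 0x3fefffffffffffff is the largest double strictly less than 1.0, used to
  // clamp the buggy V_FRACT result as described above.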
2317   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2318 
2319   Register Min = MRI.createGenericVirtualRegister(S64);
2320 
2321   // We don't need to concern ourselves with the snan handling difference, so
2322   // use the one which will directly select.
2323   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2324   if (MFI->getMode().IEEE)
2325     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2326   else
2327     B.buildFMinNum(Min, Fract, Const, Flags);
2328 
2329   Register CorrectedFract = Min;
2330   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2331     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2332     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2333   }
2334 
2335   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2336   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2337 
2338   MI.eraseFromParent();
2339   return true;
2340 }
2341 
2342 // Turn an illegal packed v2s16 build vector into bit operations.
2343 // TODO: This should probably be a bitcast action in LegalizerHelper.
2344 bool AMDGPULegalizerInfo::legalizeBuildVector(
2345   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2346   Register Dst = MI.getOperand(0).getReg();
2347   const LLT S32 = LLT::scalar(32);
2348   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2349 
2350   Register Src0 = MI.getOperand(1).getReg();
2351   Register Src1 = MI.getOperand(2).getReg();
2352   assert(MRI.getType(Src0) == LLT::scalar(16));
2353 
2354   auto Merge = B.buildMerge(S32, {Src0, Src1});
2355   B.buildBitcast(Dst, Merge);
2356 
2357   MI.eraseFromParent();
2358   return true;
2359 }
2360 
2361 // Return the use branch instruction, otherwise null if the usage is invalid.
2362 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2363                                        MachineRegisterInfo &MRI,
2364                                        MachineInstr *&Br,
2365                                        MachineBasicBlock *&UncondBrTarget) {
2366   Register CondDef = MI.getOperand(0).getReg();
2367   if (!MRI.hasOneNonDBGUse(CondDef))
2368     return nullptr;
2369 
2370   MachineBasicBlock *Parent = MI.getParent();
2371   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2372   if (UseMI.getParent() != Parent ||
2373       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2374     return nullptr;
2375 
2376   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2377   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2378   if (Next == Parent->end()) {
2379     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2380     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2381       return nullptr;
2382     UncondBrTarget = &*NextMBB;
2383   } else {
2384     if (Next->getOpcode() != AMDGPU::G_BR)
2385       return nullptr;
2386     Br = &*Next;
2387     UncondBrTarget = Br->getOperand(0).getMBB();
2388   }
2389 
2390   return &UseMI;
2391 }
2392 
2393 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2394                                                MachineRegisterInfo &MRI,
2395                                                Register LiveIn,
2396                                                Register PhyReg) const {
2397   assert(PhyReg.isPhysical() && "Physical register expected");
2398 
2399   // Insert the live-in copy, if required, by defining the destination
2400   // virtual register.
2401   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2402   if (!MRI.getVRegDef(LiveIn)) {
2403     // FIXME: Should have scoped insert pt
2404     MachineBasicBlock &OrigInsBB = B.getMBB();
2405     auto OrigInsPt = B.getInsertPt();
2406 
2407     MachineBasicBlock &EntryMBB = B.getMF().front();
2408     EntryMBB.addLiveIn(PhyReg);
2409     B.setInsertPt(EntryMBB, EntryMBB.begin());
2410     B.buildCopy(LiveIn, PhyReg);
2411 
2412     B.setInsertPt(OrigInsBB, OrigInsPt);
2413   }
2414 
2415   return LiveIn;
2416 }
2417 
2418 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2419                                                 MachineRegisterInfo &MRI,
2420                                                 Register PhyReg, LLT Ty,
2421                                                 bool InsertLiveInCopy) const {
2422   assert(PhyReg.isPhysical() && "Physical register expected");
2423 
2424   // Get or create the virtual live-in register.
2425   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2426   if (!LiveIn) {
2427     LiveIn = MRI.createGenericVirtualRegister(Ty);
2428     MRI.addLiveIn(PhyReg, LiveIn);
2429   }
2430 
2431   // When the actual copy required is from a virtual register to a physical
2432   // register (to be inserted later), the live-in copy from the physical
2433   // register to the virtual register is not required.
2434   if (!InsertLiveInCopy)
2435     return LiveIn;
2436 
2437   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2438 }
2439 
2440 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2441     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2443   const ArgDescriptor *Arg;
2444   const TargetRegisterClass *RC;
2445   LLT ArgTy;
2446   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2447   if (!Arg) {
2448     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2449     return nullptr;
2450   }
2451   return Arg;
2452 }
2453 
2454 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2455                                          const ArgDescriptor *Arg) const {
2456   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2457     return false; // TODO: Handle these
2458 
2459   Register SrcReg = Arg->getRegister();
2460   assert(SrcReg.isPhysical() && "Physical register expected");
2461   assert(DstReg.isVirtual() && "Virtual register expected");
2462 
2463   MachineRegisterInfo &MRI = *B.getMRI();
2464 
2465   LLT Ty = MRI.getType(DstReg);
2466   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2467 
2468   if (Arg->isMasked()) {
2469     // TODO: Should we try to emit this once in the entry block?
2470     const LLT S32 = LLT::scalar(32);
2471     const unsigned Mask = Arg->getMask();
2472     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2473 
2474     Register AndMaskSrc = LiveIn;
2475 
2476     if (Shift != 0) {
2477       auto ShiftAmt = B.buildConstant(S32, Shift);
2478       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2479     }
2480 
2481     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2482   } else {
2483     B.buildCopy(DstReg, LiveIn);
2484   }
2485 
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2490     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2491     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2492 
2493   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2494   if (!Arg)
2495     return false;
2496 
2497   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2498     return false;
2499 
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2505                                        MachineRegisterInfo &MRI,
2506                                        MachineIRBuilder &B) const {
2507   Register Dst = MI.getOperand(0).getReg();
2508   LLT DstTy = MRI.getType(Dst);
2509   LLT S16 = LLT::scalar(16);
2510   LLT S32 = LLT::scalar(32);
2511   LLT S64 = LLT::scalar(64);
2512 
2513   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2514     return true;
2515 
2516   if (DstTy == S16)
2517     return legalizeFDIV16(MI, MRI, B);
2518   if (DstTy == S32)
2519     return legalizeFDIV32(MI, MRI, B);
2520   if (DstTy == S64)
2521     return legalizeFDIV64(MI, MRI, B);
2522 
2523   return false;
2524 }
2525 
2526 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2527                                                   Register DstReg,
2528                                                   Register X,
2529                                                   Register Y,
2530                                                   bool IsDiv) const {
2531   const LLT S1 = LLT::scalar(1);
2532   const LLT S32 = LLT::scalar(32);
2533 
2534   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2535   // algorithm used here.
2536 
2537   // Initial estimate of inv(y).
2538   auto FloatY = B.buildUITOFP(S32, Y);
2539   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
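  // 0x4f7ffffe is just below 2^32, keeping the initial estimate of 2^32 / y
  // representable in 32 bits (e.g. when y == 1).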
2540   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2541   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2542   auto Z = B.buildFPTOUI(S32, ScaledY);
2543 
2544   // One round of UNR (unsigned Newton-Raphson) to refine the reciprocal.
2545   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2546   auto NegYZ = B.buildMul(S32, NegY, Z);
2547   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2548 
2549   // Quotient/remainder estimate.
2550   auto Q = B.buildUMulH(S32, X, Z);
2551   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2552 
2553   // First quotient/remainder refinement.
2554   auto One = B.buildConstant(S32, 1);
2555   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2556   if (IsDiv)
2557     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2558   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2559 
2560   // Second quotient/remainder refinement.
2561   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2562   if (IsDiv)
2563     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2564   else
2565     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2566 }
2567 
2568 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2569                                               MachineRegisterInfo &MRI,
2570                                               MachineIRBuilder &B) const {
2571   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2572   Register DstReg = MI.getOperand(0).getReg();
2573   Register Num = MI.getOperand(1).getReg();
2574   Register Den = MI.getOperand(2).getReg();
2575   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2576   MI.eraseFromParent();
2577   return true;
2578 }
2579 
2580 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2581 //
2582 // Return lo, hi of result
2583 //
2584 // %cvt.lo = G_UITOFP Val.lo
2585 // %cvt.hi = G_UITOFP Val.hi
2586 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2587 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2588 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2589 // %mul2 = G_FMUL %mul1, 2**(-32)
2590 // %trunc = G_INTRINSIC_TRUNC %mul2
2591 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2592 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2593 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2594                                                        Register Val) {
2595   const LLT S32 = LLT::scalar(32);
2596   auto Unmerge = B.buildUnmerge(S32, Val);
2597 
2598   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2599   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2600 
2601   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2602                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2603 
2604   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2605   auto Mul1 =
2606       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2607 
2608   // 2**(-32)
2609   auto Mul2 =
2610       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2611   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2612 
2613   // -(2**32)
2614   auto Mad2 = B.buildFMAD(S32, Trunc,
2615                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2616 
2617   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2618   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2619 
2620   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2621 }
2622 
2623 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2624                                                   Register DstReg,
2625                                                   Register Numer,
2626                                                   Register Denom,
2627                                                   bool IsDiv) const {
2628   const LLT S32 = LLT::scalar(32);
2629   const LLT S64 = LLT::scalar(64);
2630   const LLT S1 = LLT::scalar(1);
2631   Register RcpLo, RcpHi;
2632 
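  // Compute a 64-bit fixed-point estimate of 1/denom, refine it with two
  // Newton-Raphson style steps carried out in 32-bit pieces, then form the
  // quotient/remainder and apply up to two correction steps.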
2633   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2634 
2635   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2636 
2637   auto Zero64 = B.buildConstant(S64, 0);
2638   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2639 
2640   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2641   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2642 
2643   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2644   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2645   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2646 
2647   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2648   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2649   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2650   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2651 
2652   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2653   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2654   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2655   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2656   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2657 
2658   auto Zero32 = B.buildConstant(S32, 0);
2659   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2660   auto Add2_HiC =
2661       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2662   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2663   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2664 
2665   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2666   Register NumerLo = UnmergeNumer.getReg(0);
2667   Register NumerHi = UnmergeNumer.getReg(1);
2668 
2669   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2670   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2671   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2672   Register Mul3_Lo = UnmergeMul3.getReg(0);
2673   Register Mul3_Hi = UnmergeMul3.getReg(1);
2674   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2675   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2676   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2677   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2678 
2679   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2680   Register DenomLo = UnmergeDenom.getReg(0);
2681   Register DenomHi = UnmergeDenom.getReg(1);
2682 
2683   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2684   auto C1 = B.buildSExt(S32, CmpHi);
2685 
2686   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2687   auto C2 = B.buildSExt(S32, CmpLo);
2688 
2689   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2690   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2691 
2692   // TODO: Here and below portions of the code can be enclosed in if/endif.
2693   // Currently control flow is unconditional and we have 4 selects after
2694   // potential endif to substitute PHIs.
2695 
2696   // if C3 != 0 ...
2697   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2698   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2699   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2700   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2701 
2702   auto One64 = B.buildConstant(S64, 1);
2703   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2704 
2705   auto C4 =
2706       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2707   auto C5 =
2708       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2709   auto C6 = B.buildSelect(
2710       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2711 
2712   // if (C6 != 0)
2713   auto Add4 = B.buildAdd(S64, Add3, One64);
2714   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2715 
2716   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2717   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2718   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2719 
2720   // endif C6
2721   // endif C3
2722 
2723   if (IsDiv) {
2724     auto Sel1 = B.buildSelect(
2725         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2726     B.buildSelect(DstReg,
2727                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2728   } else {
2729     auto Sel2 = B.buildSelect(
2730         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2731     B.buildSelect(DstReg,
2732                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2733   }
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2737                                             MachineRegisterInfo &MRI,
2738                                             MachineIRBuilder &B) const {
2739   const LLT S64 = LLT::scalar(64);
2740   const LLT S32 = LLT::scalar(32);
2741   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2742   Register DstReg = MI.getOperand(0).getReg();
2743   Register Num = MI.getOperand(1).getReg();
2744   Register Den = MI.getOperand(2).getReg();
2745   LLT Ty = MRI.getType(DstReg);
2746 
2747   if (Ty == S32)
2748     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2749   else if (Ty == S64)
2750     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2751   else
2752     return false;
2753 
2754   MI.eraseFromParent();
2755   return true;
2757 }
2758 
2759 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2760                                             MachineRegisterInfo &MRI,
2761                                             MachineIRBuilder &B) const {
2762   const LLT S64 = LLT::scalar(64);
2763   const LLT S32 = LLT::scalar(32);
2764 
2765   Register DstReg = MI.getOperand(0).getReg();
2766   const LLT Ty = MRI.getType(DstReg);
2767   if (Ty != S32 && Ty != S64)
2768     return false;
2769 
2770   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2771 
2772   Register LHS = MI.getOperand(1).getReg();
2773   Register RHS = MI.getOperand(2).getReg();
2774 
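  // Compute |LHS| and |RHS| as (x + sign) ^ sign, where sign is x arithmetically
  // shifted right by width-1. Do the unsigned division, then restore the sign of
  // the result the same way.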
2775   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2776   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2777   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2778 
2779   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2780   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2781 
2782   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2783   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2784 
2785   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2786   if (Ty == S32)
2787     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2788   else
2789     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2790 
2791   Register Sign;
2792   if (IsDiv)
2793     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2794   else
2795     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2796 
2797   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2798   B.buildSub(DstReg, UDivRem, Sign);
2799 
2800   MI.eraseFromParent();
2801   return true;
2802 }
2803 
2804 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2805                                                  MachineRegisterInfo &MRI,
2806                                                  MachineIRBuilder &B) const {
2807   Register Res = MI.getOperand(0).getReg();
2808   Register LHS = MI.getOperand(1).getReg();
2809   Register RHS = MI.getOperand(2).getReg();
2810 
2811   uint16_t Flags = MI.getFlags();
2812 
2813   LLT ResTy = MRI.getType(Res);
2814   LLT S32 = LLT::scalar(32);
2815   LLT S64 = LLT::scalar(64);
2816 
2817   const MachineFunction &MF = B.getMF();
2818   bool Unsafe =
2819     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2820 
2821   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2822     return false;
2823 
2824   if (!Unsafe && ResTy == S32 &&
2825       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2826     return false;
2827 
2828   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2829     // 1 / x -> RCP(x)
2830     if (CLHS->isExactlyValue(1.0)) {
2831       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2832         .addUse(RHS)
2833         .setMIFlags(Flags);
2834 
2835       MI.eraseFromParent();
2836       return true;
2837     }
2838 
2839     // -1 / x -> RCP( FNEG(x) )
2840     if (CLHS->isExactlyValue(-1.0)) {
2841       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2842       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2843         .addUse(FNeg.getReg(0))
2844         .setMIFlags(Flags);
2845 
2846       MI.eraseFromParent();
2847       return true;
2848     }
2849   }
2850 
2851   // x / y -> x * (1.0 / y)
2852   if (Unsafe) {
2853     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2854       .addUse(RHS)
2855       .setMIFlags(Flags);
2856     B.buildFMul(Res, LHS, RCP, Flags);
2857 
2858     MI.eraseFromParent();
2859     return true;
2860   }
2861 
2862   return false;
2863 }
2864 
2865 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2866                                          MachineRegisterInfo &MRI,
2867                                          MachineIRBuilder &B) const {
2868   Register Res = MI.getOperand(0).getReg();
2869   Register LHS = MI.getOperand(1).getReg();
2870   Register RHS = MI.getOperand(2).getReg();
2871 
2872   uint16_t Flags = MI.getFlags();
2873 
2874   LLT S16 = LLT::scalar(16);
2875   LLT S32 = LLT::scalar(32);
2876 
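  // Do the division at f32 precision with a reciprocal and multiply, truncate
  // back to f16, and let the div_fixup intrinsic patch up special-case inputs
  // such as zero or infinite denominators.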
2877   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2878   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2879 
2880   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2881     .addUse(RHSExt.getReg(0))
2882     .setMIFlags(Flags);
2883 
2884   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2885   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
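  // A rough sketch of the expansion: the f16 division is evaluated in f32 as
  // LHS * rcp(RHS) and truncated back, and div_fixup then patches up the
  // special cases (infinities, NaNs, and the like) using the original
  // operands.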
2886 
2887   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2888     .addUse(RDst.getReg(0))
2889     .addUse(RHS)
2890     .addUse(LHS)
2891     .setMIFlags(Flags);
2892 
2893   MI.eraseFromParent();
2894   return true;
2895 }
2896 
2897 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2898 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2899 static void toggleSPDenormMode(bool Enable,
2900                                MachineIRBuilder &B,
2901                                const GCNSubtarget &ST,
2902                                AMDGPU::SIModeRegisterDefaults Mode) {
2903   // Set SP denorm mode to this value.
2904   unsigned SPDenormMode =
2905     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2906 
2907   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2909     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2910 
2911     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
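    // The S_DENORM_MODE immediate packs the FP32 control in bits [1:0] and the
    // FP64/FP16 control in bits [3:2], hence the shift by 2 above.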
2912     B.buildInstr(AMDGPU::S_DENORM_MODE)
2913       .addImm(NewDenormModeValue);
2914 
2915   } else {
2916     // Select FP32 bit field in mode register.
2917     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2918                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2919                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
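    // This encodes hwreg(HW_REG_MODE, 4, 2): a two-bit field at offset 4 of
    // the MODE register, which holds the FP32 denormal controls.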
2920 
2921     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2922       .addImm(SPDenormMode)
2923       .addImm(SPDenormModeBitField);
2924   }
2925 }
2926 
2927 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2928                                          MachineRegisterInfo &MRI,
2929                                          MachineIRBuilder &B) const {
2930   Register Res = MI.getOperand(0).getReg();
2931   Register LHS = MI.getOperand(1).getReg();
2932   Register RHS = MI.getOperand(2).getReg();
2933   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2934   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2935 
2936   uint16_t Flags = MI.getFlags();
2937 
2938   LLT S32 = LLT::scalar(32);
2939   LLT S1 = LLT::scalar(1);
2940 
2941   auto One = B.buildFConstant(S32, 1.0f);
2942 
2943   auto DenominatorScaled =
2944     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2945       .addUse(LHS)
2946       .addUse(RHS)
2947       .addImm(0)
2948       .setMIFlags(Flags);
2949   auto NumeratorScaled =
2950     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2951       .addUse(LHS)
2952       .addUse(RHS)
2953       .addImm(1)
2954       .setMIFlags(Flags);
2955 
2956   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2957     .addUse(DenominatorScaled.getReg(0))
2958     .setMIFlags(Flags);
2959   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2960 
2961   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2962   // aren't modeled as reading it.
2963   if (!Mode.allFP32Denormals())
2964     toggleSPDenormMode(true, B, ST, Mode);
2965 
2966   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2967   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2968   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2969   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2970   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2971   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
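  // Roughly, with scaled numerator n and denominator d: Fma0 = 1 - d * r0 is
  // the reciprocal error, Fma1 = r0 + Fma0 * r0 the refined reciprocal, Mul
  // the initial quotient, Fma2/Fma4 the remainders n - d * q, and Fma3 the
  // refined quotient. div_fmas and div_fixup below apply the final correction
  // and special-case handling.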
2972 
2973   if (!Mode.allFP32Denormals())
2974     toggleSPDenormMode(false, B, ST, Mode);
2975 
2976   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2977     .addUse(Fma4.getReg(0))
2978     .addUse(Fma1.getReg(0))
2979     .addUse(Fma3.getReg(0))
2980     .addUse(NumeratorScaled.getReg(1))
2981     .setMIFlags(Flags);
2982 
2983   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2984     .addUse(Fmas.getReg(0))
2985     .addUse(RHS)
2986     .addUse(LHS)
2987     .setMIFlags(Flags);
2988 
2989   MI.eraseFromParent();
2990   return true;
2991 }
2992 
2993 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2994                                          MachineRegisterInfo &MRI,
2995                                          MachineIRBuilder &B) const {
2996   Register Res = MI.getOperand(0).getReg();
2997   Register LHS = MI.getOperand(1).getReg();
2998   Register RHS = MI.getOperand(2).getReg();
2999 
3000   uint16_t Flags = MI.getFlags();
3001 
3002   LLT S64 = LLT::scalar(64);
3003   LLT S1 = LLT::scalar(1);
3004 
3005   auto One = B.buildFConstant(S64, 1.0);
3006 
3007   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3008     .addUse(LHS)
3009     .addUse(RHS)
3010     .addImm(0)
3011     .setMIFlags(Flags);
3012 
3013   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3014 
3015   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3016     .addUse(DivScale0.getReg(0))
3017     .setMIFlags(Flags);
3018 
3019   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3020   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3021   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3022 
3023   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3024     .addUse(LHS)
3025     .addUse(RHS)
3026     .addImm(1)
3027     .setMIFlags(Flags);
3028 
3029   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3030   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3031   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
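  // This mirrors the f32 expansion: Fma0/Fma2 are reciprocal error terms,
  // Fma1/Fma3 successively refined reciprocals of the scaled denominator, Mul
  // the quotient estimate, and Fma4 the remainder fed into div_fmas.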
3032 
3033   Register Scale;
3034   if (!ST.hasUsableDivScaleConditionOutput()) {
3035     // Workaround a hardware bug on SI where the condition output from div_scale
3036     // is not usable.
3037 
3038     LLT S32 = LLT::scalar(32);
3039 
3040     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3041     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3042     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3043     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3044 
3045     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3046                               Scale1Unmerge.getReg(1));
3047     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3048                               Scale0Unmerge.getReg(1));
3049     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3050   } else {
3051     Scale = DivScale1.getReg(1);
3052   }
3053 
3054   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3055     .addUse(Fma4.getReg(0))
3056     .addUse(Fma3.getReg(0))
3057     .addUse(Mul.getReg(0))
3058     .addUse(Scale)
3059     .setMIFlags(Flags);
3060 
3061   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3062     .addUse(Fmas.getReg(0))
3063     .addUse(RHS)
3064     .addUse(LHS)
3065     .setMIFlags(Flags);
3066 
3067   MI.eraseFromParent();
3068   return true;
3069 }
3070 
3071 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3072                                                  MachineRegisterInfo &MRI,
3073                                                  MachineIRBuilder &B) const {
3074   Register Res = MI.getOperand(0).getReg();
3075   Register LHS = MI.getOperand(2).getReg();
3076   Register RHS = MI.getOperand(3).getReg();
3077   uint16_t Flags = MI.getFlags();
3078 
3079   LLT S32 = LLT::scalar(32);
3080   LLT S1 = LLT::scalar(1);
3081 
3082   auto Abs = B.buildFAbs(S32, RHS, Flags);
3083   const APFloat C0Val(1.0f);
3084 
3085   auto C0 = B.buildConstant(S32, 0x6f800000);
3086   auto C1 = B.buildConstant(S32, 0x2f800000);
3087   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
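  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. When |RHS|
  // is huge, the denominator is pre-scaled by 2^-32 (via Sel) so rcp stays in
  // range, and the final multiply by Sel rescales the quotient:
  // Sel * (LHS * rcp(RHS * Sel)) is approximately LHS / RHS.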
3088 
3089   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3090   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3091 
3092   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3093 
3094   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3095     .addUse(Mul0.getReg(0))
3096     .setMIFlags(Flags);
3097 
3098   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3099 
3100   B.buildFMul(Res, Sel, Mul1, Flags);
3101 
3102   MI.eraseFromParent();
3103   return true;
3104 }
3105 
3106 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3107                                                  MachineRegisterInfo &MRI,
3108                                                  MachineIRBuilder &B) const {
3109   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3110   if (!MFI->isEntryFunction()) {
3111     return legalizePreloadedArgIntrin(MI, MRI, B,
3112                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3113   }
3114 
3115   uint64_t Offset =
3116     ST.getTargetLowering()->getImplicitParameterOffset(
3117       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3118   Register DstReg = MI.getOperand(0).getReg();
3119   LLT DstTy = MRI.getType(DstReg);
3120   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3121 
3122   const ArgDescriptor *Arg;
3123   const TargetRegisterClass *RC;
3124   LLT ArgTy;
3125   std::tie(Arg, RC, ArgTy) =
3126       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3127   if (!Arg)
3128     return false;
3129 
3130   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3131   if (!loadInputValue(KernargPtrReg, B, Arg))
3132     return false;
3133 
3134   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3135   MI.eraseFromParent();
3136   return true;
3137 }
3138 
3139 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3140                                               MachineRegisterInfo &MRI,
3141                                               MachineIRBuilder &B,
3142                                               unsigned AddrSpace) const {
3143   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
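  // A flat pointer lands in the queried segment exactly when the high 32 bits
  // of its address equal that segment's aperture base, so compare them.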
3144   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3145   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3146   MI.eraseFromParent();
3147   return true;
3148 }
3149 
3150 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3151 // offset (the offset that is included in bounds checking and swizzling, to be
3152 // split between the instruction's voffset and immoffset fields) and soffset
3153 // (the offset that is excluded from bounds checking and swizzling, to go in
3154 // the instruction's soffset field).  This function takes the first kind of
3155 // offset and figures out how to split it between voffset and immoffset.
3156 std::tuple<Register, unsigned, unsigned>
3157 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3158                                         Register OrigOffset) const {
3159   const unsigned MaxImm = 4095;
3160   Register BaseReg;
3161   unsigned TotalConstOffset;
3162   MachineInstr *OffsetDef;
3163   const LLT S32 = LLT::scalar(32);
3164 
3165   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3166     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3167 
3168   unsigned ImmOffset = TotalConstOffset;
3169 
  // If the immediate value is too big for the immoffset field, keep only its
  // low 12 bits (value & 4095) there, so that the value that is copied/added
3172   // for the voffset field is a multiple of 4096, and it stands more chance
3173   // of being CSEd with the copy/add for another similar load/store.
3174   // However, do not do that rounding down to a multiple of 4096 if that is a
3175   // negative number, as it appears to be illegal to have a negative offset
3176   // in the vgpr, even if adding the immediate offset makes it positive.
3177   unsigned Overflow = ImmOffset & ~MaxImm;
3178   ImmOffset -= Overflow;
3179   if ((int32_t)Overflow < 0) {
3180     Overflow += ImmOffset;
3181     ImmOffset = 0;
3182   }
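  // For example, a constant offset of 5000 ends up as ImmOffset = 904, with
  // the remaining 4096 added to the voffset register below.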
3183 
3184   if (Overflow != 0) {
3185     if (!BaseReg) {
3186       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3187     } else {
3188       auto OverflowVal = B.buildConstant(S32, Overflow);
3189       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3190     }
3191   }
3192 
3193   if (!BaseReg)
3194     BaseReg = B.buildConstant(S32, 0).getReg(0);
3195 
3196   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3197 }
3198 
3199 /// Handle register layout difference for f16 images for some subtargets.
3200 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3201                                              MachineRegisterInfo &MRI,
3202                                              Register Reg) const {
3203   if (!ST.hasUnpackedD16VMem())
3204     return Reg;
3205 
3206   const LLT S16 = LLT::scalar(16);
3207   const LLT S32 = LLT::scalar(32);
3208   LLT StoreVT = MRI.getType(Reg);
3209   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3210 
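  // On unpacked-D16 subtargets each 16-bit component occupies its own 32-bit
  // register, so e.g. a <4 x s16> store value becomes <4 x s32> with every
  // element any-extended.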
3211   auto Unmerge = B.buildUnmerge(S16, Reg);
3212 
3213   SmallVector<Register, 4> WideRegs;
3214   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3215     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3216 
3217   int NumElts = StoreVT.getNumElements();
3218 
3219   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3220 }
3221 
3222 Register AMDGPULegalizerInfo::fixStoreSourceType(
3223   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3224   MachineRegisterInfo *MRI = B.getMRI();
3225   LLT Ty = MRI->getType(VData);
3226 
3227   const LLT S16 = LLT::scalar(16);
3228 
3229   // Fixup illegal register types for i8 stores.
3230   if (Ty == LLT::scalar(8) || Ty == S16) {
3231     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3232     return AnyExt;
3233   }
3234 
3235   if (Ty.isVector()) {
3236     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3237       if (IsFormat)
3238         return handleD16VData(B, *MRI, VData);
3239     }
3240   }
3241 
3242   return VData;
3243 }
3244 
3245 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3246                                               MachineRegisterInfo &MRI,
3247                                               MachineIRBuilder &B,
3248                                               bool IsTyped,
3249                                               bool IsFormat) const {
3250   Register VData = MI.getOperand(1).getReg();
3251   LLT Ty = MRI.getType(VData);
3252   LLT EltTy = Ty.getScalarType();
3253   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3254   const LLT S32 = LLT::scalar(32);
3255 
3256   VData = fixStoreSourceType(B, VData, IsFormat);
3257   Register RSrc = MI.getOperand(2).getReg();
3258 
3259   MachineMemOperand *MMO = *MI.memoperands_begin();
3260   const int MemSize = MMO->getSize();
3261 
3262   unsigned ImmOffset;
3263   unsigned TotalOffset;
3264 
3265   // The typed intrinsics add an immediate after the registers.
3266   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3267 
3268   // The struct intrinsic variants add one additional operand over raw.
3269   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3270   Register VIndex;
3271   int OpOffset = 0;
3272   if (HasVIndex) {
3273     VIndex = MI.getOperand(3).getReg();
3274     OpOffset = 1;
3275   }
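  // Operand layout after the intrinsic ID: vdata, rsrc, [vindex,] voffset,
  // soffset, [format,] aux. The vindex operand only exists for the struct
  // variants and the format immediate only for the typed ones, as handled
  // below.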
3276 
3277   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3278   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3279 
3280   unsigned Format = 0;
3281   if (IsTyped) {
3282     Format = MI.getOperand(5 + OpOffset).getImm();
3283     ++OpOffset;
3284   }
3285 
3286   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3287 
3288   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3289   if (TotalOffset != 0)
3290     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3291 
3292   unsigned Opc;
3293   if (IsTyped) {
3294     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3295                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3296   } else if (IsFormat) {
3297     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3298                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3299   } else {
3300     switch (MemSize) {
3301     case 1:
3302       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3303       break;
3304     case 2:
3305       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3306       break;
3307     default:
3308       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3309       break;
3310     }
3311   }
3312 
3313   if (!VIndex)
3314     VIndex = B.buildConstant(S32, 0).getReg(0);
3315 
3316   auto MIB = B.buildInstr(Opc)
3317     .addUse(VData)              // vdata
3318     .addUse(RSrc)               // rsrc
3319     .addUse(VIndex)             // vindex
3320     .addUse(VOffset)            // voffset
3321     .addUse(SOffset)            // soffset
3322     .addImm(ImmOffset);         // offset(imm)
3323 
3324   if (IsTyped)
3325     MIB.addImm(Format);
3326 
3327   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3328      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3329      .addMemOperand(MMO);
3330 
3331   MI.eraseFromParent();
3332   return true;
3333 }
3334 
3335 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3336                                              MachineRegisterInfo &MRI,
3337                                              MachineIRBuilder &B,
3338                                              bool IsFormat,
3339                                              bool IsTyped) const {
3340   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3341   MachineMemOperand *MMO = *MI.memoperands_begin();
3342   const int MemSize = MMO->getSize();
3343   const LLT S32 = LLT::scalar(32);
3344 
3345   Register Dst = MI.getOperand(0).getReg();
3346   Register RSrc = MI.getOperand(2).getReg();
3347 
3348   // The typed intrinsics add an immediate after the registers.
3349   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3350 
3351   // The struct intrinsic variants add one additional operand over raw.
3352   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3353   Register VIndex;
3354   int OpOffset = 0;
3355   if (HasVIndex) {
3356     VIndex = MI.getOperand(3).getReg();
3357     OpOffset = 1;
3358   }
3359 
3360   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3361   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3362 
3363   unsigned Format = 0;
3364   if (IsTyped) {
3365     Format = MI.getOperand(5 + OpOffset).getImm();
3366     ++OpOffset;
3367   }
3368 
3369   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3370   unsigned ImmOffset;
3371   unsigned TotalOffset;
3372 
3373   LLT Ty = MRI.getType(Dst);
3374   LLT EltTy = Ty.getScalarType();
3375   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3376   const bool Unpacked = ST.hasUnpackedD16VMem();
3377 
3378   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3379   if (TotalOffset != 0)
3380     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3381 
3382   unsigned Opc;
3383 
3384   if (IsTyped) {
3385     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3386                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3387   } else if (IsFormat) {
3388     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3389                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3390   } else {
3391     switch (MemSize) {
3392     case 1:
3393       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3394       break;
3395     case 2:
3396       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3397       break;
3398     default:
3399       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3400       break;
3401     }
3402   }
3403 
3404   Register LoadDstReg;
3405 
3406   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3407   LLT UnpackedTy = Ty.changeElementSize(32);
3408 
3409   if (IsExtLoad)
3410     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3411   else if (Unpacked && IsD16 && Ty.isVector())
3412     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3413   else
3414     LoadDstReg = Dst;
3415 
3416   if (!VIndex)
3417     VIndex = B.buildConstant(S32, 0).getReg(0);
3418 
3419   auto MIB = B.buildInstr(Opc)
3420     .addDef(LoadDstReg)         // vdata
3421     .addUse(RSrc)               // rsrc
3422     .addUse(VIndex)             // vindex
3423     .addUse(VOffset)            // voffset
3424     .addUse(SOffset)            // soffset
3425     .addImm(ImmOffset);         // offset(imm)
3426 
3427   if (IsTyped)
3428     MIB.addImm(Format);
3429 
3430   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3431      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3432      .addMemOperand(MMO);
3433 
3434   if (LoadDstReg != Dst) {
3435     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3436 
    // The result register was widened for the extending load; truncate it back.
3438     if (IsExtLoad)
3439       B.buildTrunc(Dst, LoadDstReg);
3440     else {
3441       // Repack to original 16-bit vector result
3442       // FIXME: G_TRUNC should work, but legalization currently fails
3443       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3444       SmallVector<Register, 4> Repack;
3445       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3446         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3447       B.buildMerge(Dst, Repack);
3448     }
3449   }
3450 
3451   MI.eraseFromParent();
3452   return true;
3453 }
3454 
3455 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3456                                                MachineIRBuilder &B,
3457                                                bool IsInc) const {
3458   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3459                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3460   B.buildInstr(Opc)
3461     .addDef(MI.getOperand(0).getReg())
3462     .addUse(MI.getOperand(2).getReg())
3463     .addUse(MI.getOperand(3).getReg())
3464     .cloneMemRefs(MI);
3465   MI.eraseFromParent();
3466   return true;
3467 }
3468 
3469 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3470   switch (IntrID) {
3471   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3472   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3473     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3474   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3475   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3476     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3477   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3478   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3479     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3480   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3482     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3483   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3485     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3486   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3488     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3489   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3495   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3497     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3498   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3499   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3500     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3501   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3502   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3503     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3504   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3505   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3506     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3507   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3508   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3509     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3510   default:
3511     llvm_unreachable("unhandled atomic opcode");
3512   }
3513 }
3514 
3515 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3516                                                MachineIRBuilder &B,
3517                                                Intrinsic::ID IID) const {
3518   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3519                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3520 
3521   Register Dst = MI.getOperand(0).getReg();
3522   Register VData = MI.getOperand(2).getReg();
3523 
3524   Register CmpVal;
3525   int OpOffset = 0;
3526 
3527   if (IsCmpSwap) {
3528     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3529     ++OpOffset;
3530   }
3531 
3532   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3533   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3534 
3535   // The struct intrinsic variants add one additional operand over raw.
3536   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3537   Register VIndex;
3538   if (HasVIndex) {
3539     VIndex = MI.getOperand(4 + OpOffset).getReg();
3540     ++OpOffset;
3541   }
3542 
3543   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3544   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3545   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3546 
3547   MachineMemOperand *MMO = *MI.memoperands_begin();
3548 
3549   unsigned ImmOffset;
3550   unsigned TotalOffset;
3551   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3552   if (TotalOffset != 0)
3553     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3554 
3555   if (!VIndex)
3556     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3557 
3558   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3559     .addDef(Dst)
3560     .addUse(VData); // vdata
3561 
3562   if (IsCmpSwap)
3563     MIB.addReg(CmpVal);
3564 
3565   MIB.addUse(RSrc)               // rsrc
3566      .addUse(VIndex)             // vindex
3567      .addUse(VOffset)            // voffset
3568      .addUse(SOffset)            // soffset
3569      .addImm(ImmOffset)          // offset(imm)
3570      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3571      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3572      .addMemOperand(MMO);
3573 
3574   MI.eraseFromParent();
3575   return true;
3576 }
3577 
3578 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3579 /// vector with s16 typed elements.
3580 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3581                                         SmallVectorImpl<Register> &PackedAddrs,
3582                                         int AddrIdx, int DimIdx, int EndIdx,
3583                                         int NumGradients) {
3584   const LLT S16 = LLT::scalar(16);
3585   const LLT V2S16 = LLT::vector(2, 16);
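  // For example, a 2D sample's (x, y) coordinates end up in a single <2 x s16>
  // register, while a trailing odd coordinate (or odd gradient pair) is padded
  // with undef in its high half.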
3586 
3587   for (int I = AddrIdx; I < EndIdx; ++I) {
3588     MachineOperand &SrcOp = MI.getOperand(I);
3589     if (!SrcOp.isReg())
3590       continue; // _L to _LZ may have eliminated this.
3591 
3592     Register AddrReg = SrcOp.getReg();
3593 
3594     if (I < DimIdx) {
3595       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3596       PackedAddrs.push_back(AddrReg);
3597     } else {
3598       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3599       // derivatives dx/dh and dx/dv are packed with undef.
3600       if (((I + 1) >= EndIdx) ||
3601           ((NumGradients / 2) % 2 == 1 &&
3602            (I == DimIdx + (NumGradients / 2) - 1 ||
3603             I == DimIdx + NumGradients - 1)) ||
3604           // Check for _L to _LZ optimization
3605           !MI.getOperand(I + 1).isReg()) {
3606         PackedAddrs.push_back(
3607             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3608                 .getReg(0));
3609       } else {
3610         PackedAddrs.push_back(
3611             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3612                 .getReg(0));
3613         ++I;
3614       }
3615     }
3616   }
3617 }
3618 
3619 /// Convert from separate vaddr components to a single vector address register,
3620 /// and replace the remaining operands with $noreg.
3621 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3622                                      int DimIdx, int NumVAddrs) {
3623   const LLT S32 = LLT::scalar(32);
3624 
3625   SmallVector<Register, 8> AddrRegs;
3626   for (int I = 0; I != NumVAddrs; ++I) {
3627     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3628     if (SrcOp.isReg()) {
3629       AddrRegs.push_back(SrcOp.getReg());
3630       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3631     }
3632   }
3633 
3634   int NumAddrRegs = AddrRegs.size();
3635   if (NumAddrRegs != 1) {
3636     // Round up to 8 elements for v5-v7
3637     // FIXME: Missing intermediate sized register classes and instructions.
3638     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3639       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3640       auto Undef = B.buildUndef(S32);
3641       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3642       NumAddrRegs = RoundedNumRegs;
3643     }
3644 
3645     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3646     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3647   }
3648 
3649   for (int I = 1; I != NumVAddrs; ++I) {
3650     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3651     if (SrcOp.isReg())
3652       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3653   }
3654 }
3655 
3656 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3657 ///
/// Depending on the subtarget, 16-bit element loads and stores need to be
3659 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3660 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3661 /// registers.
3662 ///
3663 /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3668 /// now unnecessary arguments with $noreg.
3669 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3670     MachineInstr &MI, MachineIRBuilder &B,
3671     GISelChangeObserver &Observer,
3672     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3673 
3674   const int NumDefs = MI.getNumExplicitDefs();
3675   bool IsTFE = NumDefs == 2;
3676   // We are only processing the operands of d16 image operations on subtargets
3677   // that use the unpacked register layout, or need to repack the TFE result.
3678 
3679   // TODO: Do we need to guard against already legalized intrinsics?
3680   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3681     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3682 
3683   MachineRegisterInfo *MRI = B.getMRI();
3684   const LLT S32 = LLT::scalar(32);
3685   const LLT S16 = LLT::scalar(16);
3686   const LLT V2S16 = LLT::vector(2, 16);
3687 
3688   // Index of first address argument
3689   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3690 
3691   int NumVAddrs, NumGradients;
3692   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3693   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3694     getDMaskIdx(BaseOpcode, NumDefs);
3695   unsigned DMask = 0;
3696 
3697   // Check for 16 bit addresses and pack if true.
3698   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3699   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3700   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3701   const bool IsG16 = GradTy == S16;
3702   const bool IsA16 = AddrTy == S16;
3703 
3704   int DMaskLanes = 0;
3705   if (!BaseOpcode->Atomic) {
3706     DMask = MI.getOperand(DMaskIdx).getImm();
3707     if (BaseOpcode->Gather4) {
3708       DMaskLanes = 4;
3709     } else if (DMask != 0) {
3710       DMaskLanes = countPopulation(DMask);
3711     } else if (!IsTFE && !BaseOpcode->Store) {
3712       // If dmask is 0, this is a no-op load. This can be eliminated.
3713       B.buildUndef(MI.getOperand(0));
3714       MI.eraseFromParent();
3715       return true;
3716     }
3717   }
3718 
3719   Observer.changingInstr(MI);
3720   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3721 
3722   unsigned NewOpcode = NumDefs == 0 ?
3723     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3724 
3725   // Track that we legalized this
3726   MI.setDesc(B.getTII().get(NewOpcode));
3727 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3730   if (IsTFE && DMask == 0) {
3731     DMask = 0x1;
3732     DMaskLanes = 1;
3733     MI.getOperand(DMaskIdx).setImm(DMask);
3734   }
3735 
3736   if (BaseOpcode->Atomic) {
3737     Register VData0 = MI.getOperand(2).getReg();
3738     LLT Ty = MRI->getType(VData0);
3739 
3740     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3741     if (Ty.isVector())
3742       return false;
3743 
3744     if (BaseOpcode->AtomicX2) {
3745       Register VData1 = MI.getOperand(3).getReg();
3746       // The two values are packed in one register.
3747       LLT PackedTy = LLT::vector(2, Ty);
3748       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3749       MI.getOperand(2).setReg(Concat.getReg(0));
3750       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3751     }
3752   }
3753 
3754   int CorrectedNumVAddrs = NumVAddrs;
3755 
3756   // Optimize _L to _LZ when _L is zero
3757   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3758         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3759     const ConstantFP *ConstantLod;
3760     const int LodIdx = AddrIdx + NumVAddrs - 1;
3761 
3762     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3763       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3764         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3765         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3766           LZMappingInfo->LZ, ImageDimIntr->Dim);
3767 
3768         // The starting indexes should remain in the same place.
3769         --NumVAddrs;
3770         --CorrectedNumVAddrs;
3771 
3772         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3773           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3774         MI.RemoveOperand(LodIdx);
3775       }
3776     }
3777   }
3778 
3779   // Optimize _mip away, when 'lod' is zero
3780   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3781     int64_t ConstantLod;
3782     const int LodIdx = AddrIdx + NumVAddrs - 1;
3783 
3784     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3785       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3788         MI.getOperand(LodIdx).ChangeToImmediate(0);
3789         --CorrectedNumVAddrs;
3790       }
3791     }
3792   }
3793 
3794   // Rewrite the addressing register layout before doing anything else.
3795   if (IsA16 || IsG16) {
3796     if (IsA16) {
3797       // Target must support the feature and gradients need to be 16 bit too
3798       if (!ST.hasA16() || !IsG16)
3799         return false;
3800     } else if (!ST.hasG16())
3801       return false;
3802 
3803     if (NumVAddrs > 1) {
3804       SmallVector<Register, 4> PackedRegs;
3805       // Don't compress addresses for G16
3806       const int PackEndIdx =
3807           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3808       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3809                                   PackEndIdx, NumGradients);
3810 
3811       if (!IsA16) {
3812         // Add uncompressed address
3813         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3814           int AddrReg = MI.getOperand(I).getReg();
3815           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3816           PackedRegs.push_back(AddrReg);
3817         }
3818       }
3819 
3820       // See also below in the non-a16 branch
3821       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3822 
3823       if (!UseNSA && PackedRegs.size() > 1) {
3824         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3825         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3826         PackedRegs[0] = Concat.getReg(0);
3827         PackedRegs.resize(1);
3828       }
3829 
3830       const int NumPacked = PackedRegs.size();
3831       for (int I = 0; I != NumVAddrs; ++I) {
3832         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3833         if (!SrcOp.isReg()) {
3834           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3835           continue;
3836         }
3837 
3838         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3839 
3840         if (I < NumPacked)
3841           SrcOp.setReg(PackedRegs[I]);
3842         else
3843           SrcOp.setReg(AMDGPU::NoRegister);
3844       }
3845     }
3846   } else {
3847     // If the register allocator cannot place the address registers contiguously
3848     // without introducing moves, then using the non-sequential address encoding
3849     // is always preferable, since it saves VALU instructions and is usually a
3850     // wash in terms of code size or even better.
3851     //
3852     // However, we currently have no way of hinting to the register allocator
3853     // that MIMG addresses should be placed contiguously when it is possible to
3854     // do so, so force non-NSA for the common 2-address case as a heuristic.
3855     //
3856     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3857     // allocation when possible.
3858     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3859 
3860     if (!UseNSA && NumVAddrs > 1)
3861       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3862   }
3863 
3864   int Flags = 0;
3865   if (IsA16)
3866     Flags |= 1;
3867   if (IsG16)
3868     Flags |= 2;
3869   MI.addOperand(MachineOperand::CreateImm(Flags));
3870 
3871   if (BaseOpcode->Store) { // No TFE for stores?
3872     // TODO: Handle dmask trim
3873     Register VData = MI.getOperand(1).getReg();
3874     LLT Ty = MRI->getType(VData);
3875     if (!Ty.isVector() || Ty.getElementType() != S16)
3876       return true;
3877 
3878     Register RepackedReg = handleD16VData(B, *MRI, VData);
3879     if (RepackedReg != VData) {
3880       MI.getOperand(1).setReg(RepackedReg);
3881     }
3882 
3883     return true;
3884   }
3885 
3886   Register DstReg = MI.getOperand(0).getReg();
3887   LLT Ty = MRI->getType(DstReg);
3888   const LLT EltTy = Ty.getScalarType();
3889   const bool IsD16 = Ty.getScalarType() == S16;
3890   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3891 
3892   // Confirm that the return type is large enough for the dmask specified
3893   if (NumElts < DMaskLanes)
3894     return false;
3895 
3896   if (NumElts > 4 || DMaskLanes > 4)
3897     return false;
3898 
3899   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3900   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3901 
3902   // The raw dword aligned data component of the load. The only legal cases
3903   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3905   LLT RoundedTy;
3906 
  // S32 vector to cover all data, plus the TFE result element.
3908   LLT TFETy;
3909 
3910   // Register type to use for each loaded component. Will be S32 or V2S16.
3911   LLT RegTy;
3912 
3913   if (IsD16 && ST.hasUnpackedD16VMem()) {
3914     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3915     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3916     RegTy = S32;
3917   } else {
3918     unsigned EltSize = EltTy.getSizeInBits();
3919     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3920     unsigned RoundedSize = 32 * RoundedElts;
3921     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3922     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3923     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3924   }
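  // For instance, a packed d16 load with 3 active dmask lanes would use
  // RoundedTy = <4 x s16>, TFETy = <3 x s32>, and RegTy = v2s16 (or s32 when
  // TFE is enabled), assuming a vector-of-s16 result type.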
3925 
3926   // The return type does not need adjustment.
3927   // TODO: Should we change s16 case to s32 or <2 x s16>?
3928   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3929     return true;
3930 
3931   Register Dst1Reg;
3932 
3933   // Insert after the instruction.
3934   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3935 
3936   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3937   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3938   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3939   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3940 
3941   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3942 
3943   MI.getOperand(0).setReg(NewResultReg);
3944 
3945   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
3947   // register, with one additional dword beyond the loaded data. Rewrite the
3948   // return type to use a single register result.
3949 
3950   if (IsTFE) {
3951     Dst1Reg = MI.getOperand(1).getReg();
3952     if (MRI->getType(Dst1Reg) != S32)
3953       return false;
3954 
3955     // TODO: Make sure the TFE operand bit is set.
3956     MI.RemoveOperand(1);
3957 
3958     // Handle the easy case that requires no repack instructions.
3959     if (Ty == S32) {
3960       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3961       return true;
3962     }
3963   }
3964 
3965   // Now figure out how to copy the new result register back into the old
3966   // result.
3967   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3968 
3969   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3970 
3971   if (ResultNumRegs == 1) {
3972     assert(!IsTFE);
3973     ResultRegs[0] = NewResultReg;
3974   } else {
3975     // We have to repack into a new vector of some kind.
3976     for (int I = 0; I != NumDataRegs; ++I)
3977       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3978     B.buildUnmerge(ResultRegs, NewResultReg);
3979 
3980     // Drop the final TFE element to get the data part. The TFE result is
3981     // directly written to the right place already.
3982     if (IsTFE)
3983       ResultRegs.resize(NumDataRegs);
3984   }
3985 
3986   // For an s16 scalar result, we form an s32 result with a truncate regardless
3987   // of packed vs. unpacked.
3988   if (IsD16 && !Ty.isVector()) {
3989     B.buildTrunc(DstReg, ResultRegs[0]);
3990     return true;
3991   }
3992 
3993   // Avoid a build/concat_vector of 1 entry.
3994   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3995     B.buildBitcast(DstReg, ResultRegs[0]);
3996     return true;
3997   }
3998 
3999   assert(Ty.isVector());
4000 
4001   if (IsD16) {
4002     // For packed D16 results with TFE enabled, all the data components are
4003     // S32. Cast back to the expected type.
4004     //
    // TODO: We don't really need to load s32 elements. We would only need one
4006     // cast for the TFE result if a multiple of v2s16 was used.
4007     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4008       for (Register &Reg : ResultRegs)
4009         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4010     } else if (ST.hasUnpackedD16VMem()) {
4011       for (Register &Reg : ResultRegs)
4012         Reg = B.buildTrunc(S16, Reg).getReg(0);
4013     }
4014   }
4015 
4016   auto padWithUndef = [&](LLT Ty, int NumElts) {
4017     if (NumElts == 0)
4018       return;
4019     Register Undef = B.buildUndef(Ty).getReg(0);
4020     for (int I = 0; I != NumElts; ++I)
4021       ResultRegs.push_back(Undef);
4022   };
4023 
4024   // Pad out any elements eliminated due to the dmask.
4025   LLT ResTy = MRI->getType(ResultRegs[0]);
4026   if (!ResTy.isVector()) {
4027     padWithUndef(ResTy, NumElts - ResultRegs.size());
4028     B.buildBuildVector(DstReg, ResultRegs);
4029     return true;
4030   }
4031 
4032   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4033   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4034 
4035   // Deal with the one annoying legal case.
4036   const LLT V3S16 = LLT::vector(3, 16);
4037   if (Ty == V3S16) {
4038     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4039     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4040     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4041     return true;
4042   }
4043 
4044   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4045   B.buildConcatVectors(DstReg, ResultRegs);
4046   return true;
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4050   MachineInstr &MI, MachineIRBuilder &B,
4051   GISelChangeObserver &Observer) const {
4052   Register Dst = MI.getOperand(0).getReg();
4053   LLT Ty = B.getMRI()->getType(Dst);
4054   unsigned Size = Ty.getSizeInBits();
4055   MachineFunction &MF = B.getMF();
4056 
4057   Observer.changingInstr(MI);
4058 
4059   // FIXME: We don't really need this intermediate instruction. The intrinsic
4060   // should be fixed to have a memory operand. Since it's readnone, we're not
4061   // allowed to add one.
4062   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4063   MI.RemoveOperand(1); // Remove intrinsic ID
4064 
4065   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4066   // TODO: Should this use datalayout alignment?
4067   const unsigned MemSize = (Size + 7) / 8;
4068   const Align MemAlign(4);
4069   MachineMemOperand *MMO = MF.getMachineMemOperand(
4070       MachinePointerInfo(),
4071       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4072           MachineMemOperand::MOInvariant,
4073       MemSize, MemAlign);
4074   MI.addMemOperand(MF, MMO);
4075 
4076   // There are no 96-bit result scalar loads, but widening to 128-bit should
4077   // always be legal. We may need to restore this to a 96-bit result if it turns
4078   // out this needs to be converted to a vector load during RegBankSelect.
4079   if (!isPowerOf2_32(Size)) {
4080     LegalizerHelper Helper(MF, *this, Observer, B);
4081 
4082     if (Ty.isVector())
4083       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4084     else
4085       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4086   }
4087 
4088   Observer.changedInstr(MI);
4089   return true;
4090 }
4091 
4092 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4093                                                 MachineRegisterInfo &MRI,
4094                                                 MachineIRBuilder &B) const {
  // On a non-HSA path, or if the trap handler is disabled, just emit s_endpgm.
4096   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4097       !ST.isTrapHandlerEnabled()) {
4098     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4099   } else {
4100     // Pass queue pointer to trap handler as input, and insert trap instruction
4101     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4102     const ArgDescriptor *Arg =
4103         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4104     if (!Arg)
4105       return false;
4106     MachineRegisterInfo &MRI = *B.getMRI();
4107     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4108     Register LiveIn = getLiveInRegister(
4109         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4110         /*InsertLiveInCopy=*/false);
4111     if (!loadInputValue(LiveIn, B, Arg))
4112       return false;
4113     B.buildCopy(SGPR01, LiveIn);
4114     B.buildInstr(AMDGPU::S_TRAP)
4115         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4116         .addReg(SGPR01, RegState::Implicit);
4117   }
4118 
4119   MI.eraseFromParent();
4120   return true;
4121 }
4122 
4123 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4124     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On a non-HSA path, or if the trap handler is disabled, report a warning
  // accordingly.
4127   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4128       !ST.isTrapHandlerEnabled()) {
4129     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4130                                      "debugtrap handler not supported",
4131                                      MI.getDebugLoc(), DS_Warning);
4132     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4133     Ctx.diagnose(NoTrap);
4134   } else {
4135     // Insert debug-trap instruction
4136     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4137   }
4138 
4139   MI.eraseFromParent();
4140   return true;
4141 }
4142 
4143 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4144                                             MachineInstr &MI) const {
4145   MachineIRBuilder &B = Helper.MIRBuilder;
4146   MachineRegisterInfo &MRI = *B.getMRI();
4147 
  // Replace the use of G_BRCOND with the exec manipulation and branch pseudos.
4149   auto IntrID = MI.getIntrinsicID();
4150   switch (IntrID) {
4151   case Intrinsic::amdgcn_if:
4152   case Intrinsic::amdgcn_else: {
4153     MachineInstr *Br = nullptr;
4154     MachineBasicBlock *UncondBrTarget = nullptr;
4155     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4156       const SIRegisterInfo *TRI
4157         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4158 
4159       Register Def = MI.getOperand(1).getReg();
4160       Register Use = MI.getOperand(3).getReg();
4161 
4162       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4163       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4164       if (IntrID == Intrinsic::amdgcn_if) {
4165         B.buildInstr(AMDGPU::SI_IF)
4166           .addDef(Def)
4167           .addUse(Use)
4168           .addMBB(UncondBrTarget);
4169       } else {
4170         B.buildInstr(AMDGPU::SI_ELSE)
4171           .addDef(Def)
4172           .addUse(Use)
4173           .addMBB(UncondBrTarget)
4174           .addImm(0);
4175       }
4176 
4177       if (Br) {
4178         Br->getOperand(0).setMBB(CondBrTarget);
4179       } else {
4180         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4181         // since we're swapping branch targets it needs to be reinserted.
4182         // FIXME: IRTranslator should probably not do this
4183         B.buildBr(*CondBrTarget);
4184       }
4185 
4186       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4187       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4188       MI.eraseFromParent();
4189       BrCond->eraseFromParent();
4190       return true;
4191     }
4192 
4193     return false;
4194   }
4195   case Intrinsic::amdgcn_loop: {
4196     MachineInstr *Br = nullptr;
4197     MachineBasicBlock *UncondBrTarget = nullptr;
4198     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4199       const SIRegisterInfo *TRI
4200         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4201 
4202       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4203       Register Reg = MI.getOperand(2).getReg();
4204 
4205       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4206       B.buildInstr(AMDGPU::SI_LOOP)
4207         .addUse(Reg)
4208         .addMBB(UncondBrTarget);
4209 
4210       if (Br)
4211         Br->getOperand(0).setMBB(CondBrTarget);
4212       else
4213         B.buildBr(*CondBrTarget);
4214 
4215       MI.eraseFromParent();
4216       BrCond->eraseFromParent();
4217       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4218       return true;
4219     }
4220 
4221     return false;
4222   }
4223   case Intrinsic::amdgcn_kernarg_segment_ptr:
4224     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4225       // This only makes sense to call in a kernel, so just lower to null.
4226       B.buildConstant(MI.getOperand(0).getReg(), 0);
4227       MI.eraseFromParent();
4228       return true;
4229     }
4230 
4231     return legalizePreloadedArgIntrin(
4232       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4233   case Intrinsic::amdgcn_implicitarg_ptr:
4234     return legalizeImplicitArgPtr(MI, MRI, B);
4235   case Intrinsic::amdgcn_workitem_id_x:
4236     return legalizePreloadedArgIntrin(MI, MRI, B,
4237                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4238   case Intrinsic::amdgcn_workitem_id_y:
4239     return legalizePreloadedArgIntrin(MI, MRI, B,
4240                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4241   case Intrinsic::amdgcn_workitem_id_z:
4242     return legalizePreloadedArgIntrin(MI, MRI, B,
4243                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4244   case Intrinsic::amdgcn_workgroup_id_x:
4245     return legalizePreloadedArgIntrin(MI, MRI, B,
4246                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4247   case Intrinsic::amdgcn_workgroup_id_y:
4248     return legalizePreloadedArgIntrin(MI, MRI, B,
4249                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4250   case Intrinsic::amdgcn_workgroup_id_z:
4251     return legalizePreloadedArgIntrin(MI, MRI, B,
4252                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4253   case Intrinsic::amdgcn_dispatch_ptr:
4254     return legalizePreloadedArgIntrin(MI, MRI, B,
4255                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4256   case Intrinsic::amdgcn_queue_ptr:
4257     return legalizePreloadedArgIntrin(MI, MRI, B,
4258                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4259   case Intrinsic::amdgcn_implicit_buffer_ptr:
4260     return legalizePreloadedArgIntrin(
4261       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4262   case Intrinsic::amdgcn_dispatch_id:
4263     return legalizePreloadedArgIntrin(MI, MRI, B,
4264                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4265   case Intrinsic::amdgcn_fdiv_fast:
4266     return legalizeFDIVFastIntrin(MI, MRI, B);
4267   case Intrinsic::amdgcn_is_shared:
4268     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4269   case Intrinsic::amdgcn_is_private:
4270     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4271   case Intrinsic::amdgcn_wavefrontsize: {
4272     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4273     MI.eraseFromParent();
4274     return true;
4275   }
4276   case Intrinsic::amdgcn_s_buffer_load:
4277     return legalizeSBufferLoad(MI, B, Helper.Observer);
4278   case Intrinsic::amdgcn_raw_buffer_store:
4279   case Intrinsic::amdgcn_struct_buffer_store:
4280     return legalizeBufferStore(MI, MRI, B, false, false);
4281   case Intrinsic::amdgcn_raw_buffer_store_format:
4282   case Intrinsic::amdgcn_struct_buffer_store_format:
4283     return legalizeBufferStore(MI, MRI, B, false, true);
4284   case Intrinsic::amdgcn_raw_tbuffer_store:
4285   case Intrinsic::amdgcn_struct_tbuffer_store:
4286     return legalizeBufferStore(MI, MRI, B, true, true);
4287   case Intrinsic::amdgcn_raw_buffer_load:
4288   case Intrinsic::amdgcn_struct_buffer_load:
4289     return legalizeBufferLoad(MI, MRI, B, false, false);
4290   case Intrinsic::amdgcn_raw_buffer_load_format:
4291   case Intrinsic::amdgcn_struct_buffer_load_format:
4292     return legalizeBufferLoad(MI, MRI, B, true, false);
4293   case Intrinsic::amdgcn_raw_tbuffer_load:
4294   case Intrinsic::amdgcn_struct_tbuffer_load:
4295     return legalizeBufferLoad(MI, MRI, B, true, true);
4296   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4297   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4298   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4299   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4300   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4301   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4302   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4303   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4304   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4305   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4306   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4307   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4308   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4309   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4310   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4311   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4312   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4313   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4314   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4315   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4316   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4317   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4318   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4319   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4320   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4321   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4322     return legalizeBufferAtomic(MI, B, IntrID);
4323   case Intrinsic::amdgcn_atomic_inc:
4324     return legalizeAtomicIncDec(MI, B, true);
4325   case Intrinsic::amdgcn_atomic_dec:
4326     return legalizeAtomicIncDec(MI, B, false);
4327   case Intrinsic::trap:
4328     return legalizeTrapIntrinsic(MI, MRI, B);
4329   case Intrinsic::debugtrap:
4330     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4331   default: {
4332     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4333             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4334       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4335     return true;
4336   }
4337   }
4338 
4339   return true;
4340 }
4341