1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/LowLevelType.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetFrameLowering.h"
25 #include "llvm/CodeGen/TargetInstrInfo.h"
26 #include "llvm/CodeGen/TargetLowering.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/TargetSubtargetInfo.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/MathExtras.h"
32 #include "llvm/Support/raw_ostream.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
42 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
43 /// with any leftover piece as type \p LeftoverTy
44 ///
45 /// Returns -1 in the first element of the pair if the breakdown is not
46 /// satisfiable.
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
74 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
75 
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
95 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
96                                  GISelChangeObserver &Observer,
97                                  MachineIRBuilder &Builder)
98     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
99       LI(*MF.getSubtarget().getLegalizerInfo()),
100       TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &B)
105   : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
106     TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
108 LegalizerHelper::LegalizeResult
109 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
110                                    LostDebugLocObserver &LocObserver) {
111   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
112 
113   MIRBuilder.setInstrAndDebugLoc(MI);
114 
115   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
116       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
117     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
118   auto Step = LI.getAction(MI, MRI);
119   switch (Step.Action) {
120   case Legal:
121     LLVM_DEBUG(dbgs() << ".. Already legal\n");
122     return AlreadyLegal;
123   case Libcall:
124     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
125     return libcall(MI, LocObserver);
126   case NarrowScalar:
127     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
128     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
129   case WidenScalar:
130     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
131     return widenScalar(MI, Step.TypeIdx, Step.NewType);
132   case Bitcast:
133     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
134     return bitcast(MI, Step.TypeIdx, Step.NewType);
135   case Lower:
136     LLVM_DEBUG(dbgs() << ".. Lower\n");
137     return lower(MI, Step.TypeIdx, Step.NewType);
138   case FewerElements:
139     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
140     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
141   case MoreElements:
142     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
143     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
144   case Custom:
145     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
146     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
147   default:
148     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
149     return UnableToLegalize;
150   }
151 }
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
160 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
161                                    LLT MainTy, LLT &LeftoverTy,
162                                    SmallVectorImpl<Register> &VRegs,
163                                    SmallVectorImpl<Register> &LeftoverRegs) {
164   assert(!LeftoverTy.isValid() && "this is an out argument");
165 
166   unsigned RegSize = RegTy.getSizeInBits();
167   unsigned MainSize = MainTy.getSizeInBits();
168   unsigned NumParts = RegSize / MainSize;
169   unsigned LeftoverSize = RegSize - NumParts * MainSize;
170 
171   // Use an unmerge when possible.
172   if (LeftoverSize == 0) {
173     for (unsigned I = 0; I < NumParts; ++I)
174       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
175     MIRBuilder.buildUnmerge(VRegs, Reg);
176     return true;
177   }
178 
179   if (MainTy.isVector()) {
180     unsigned EltSize = MainTy.getScalarSizeInBits();
181     if (LeftoverSize % EltSize != 0)
182       return false;
183     LeftoverTy = LLT::scalarOrVector(
184         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
185   } else {
186     LeftoverTy = LLT::scalar(LeftoverSize);
187   }
188 
189   // For irregular sizes, extract the individual parts.
190   for (unsigned I = 0; I != NumParts; ++I) {
191     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
192     VRegs.push_back(NewReg);
193     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
194   }
195 
196   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
197        Offset += LeftoverSize) {
198     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
199     LeftoverRegs.push_back(NewReg);
200     MIRBuilder.buildExtract(NewReg, Reg, Offset);
201   }
202 
203   return true;
204 }
205 
206 void LegalizerHelper::insertParts(Register DstReg,
207                                   LLT ResultTy, LLT PartTy,
208                                   ArrayRef<Register> PartRegs,
209                                   LLT LeftoverTy,
210                                   ArrayRef<Register> LeftoverRegs) {
211   if (!LeftoverTy.isValid()) {
212     assert(LeftoverRegs.empty());
213 
214     if (!ResultTy.isVector()) {
215       MIRBuilder.buildMerge(DstReg, PartRegs);
216       return;
217     }
218 
219     if (PartTy.isVector())
220       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
221     else
222       MIRBuilder.buildBuildVector(DstReg, PartRegs);
223     return;
224   }
225 
226   SmallVector<Register> GCDRegs;
227   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
228   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
229     extractGCDType(GCDRegs, GCDTy, PartReg);
230   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
231   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
232 }
233 
234 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
235 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
236                               const MachineInstr &MI) {
237   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
238 
239   const int StartIdx = Regs.size();
240   const int NumResults = MI.getNumOperands() - 1;
241   Regs.resize(Regs.size() + NumResults);
242   for (int I = 0; I != NumResults; ++I)
243     Regs[StartIdx + I] = MI.getOperand(I).getReg();
244 }
245 
246 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
247                                      LLT GCDTy, Register SrcReg) {
248   LLT SrcTy = MRI.getType(SrcReg);
249   if (SrcTy == GCDTy) {
250     // If the source already evenly divides the result type, we don't need to do
251     // anything.
252     Parts.push_back(SrcReg);
253   } else {
254     // Need to split into common type sized pieces.
255     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
256     getUnmergeResults(Parts, *Unmerge);
257   }
258 }
259 
260 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
261                                     LLT NarrowTy, Register SrcReg) {
262   LLT SrcTy = MRI.getType(SrcReg);
263   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
264   extractGCDType(Parts, GCDTy, SrcReg);
265   return GCDTy;
266 }
267 
268 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
269                                          SmallVectorImpl<Register> &VRegs,
270                                          unsigned PadStrategy) {
271   LLT LCMTy = getLCMType(DstTy, NarrowTy);
272 
273   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
274   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
275   int NumOrigSrc = VRegs.size();
276 
277   Register PadReg;
278 
279   // Get a value we can use to pad the source value if the sources won't evenly
280   // cover the result type.
281   if (NumOrigSrc < NumParts * NumSubParts) {
282     if (PadStrategy == TargetOpcode::G_ZEXT)
283       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
284     else if (PadStrategy == TargetOpcode::G_ANYEXT)
285       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
286     else {
287       assert(PadStrategy == TargetOpcode::G_SEXT);
288 
289       // Shift the sign bit of the low register through the high register.
290       auto ShiftAmt =
291         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
292       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
293     }
294   }
295 
296   // Registers for the final merge to be produced.
297   SmallVector<Register, 4> Remerge(NumParts);
298 
299   // Registers needed for intermediate merges, which will be merged into a
300   // source for Remerge.
301   SmallVector<Register, 4> SubMerge(NumSubParts);
302 
303   // Once we've fully read off the end of the original source bits, we can reuse
304   // the same high bits for remaining padding elements.
305   Register AllPadReg;
306 
307   // Build merges to the LCM type to cover the original result type.
308   for (int I = 0; I != NumParts; ++I) {
309     bool AllMergePartsArePadding = true;
310 
311     // Build the requested merges to the requested type.
312     for (int J = 0; J != NumSubParts; ++J) {
313       int Idx = I * NumSubParts + J;
314       if (Idx >= NumOrigSrc) {
315         SubMerge[J] = PadReg;
316         continue;
317       }
318 
319       SubMerge[J] = VRegs[Idx];
320 
321       // There are meaningful bits here we can't reuse later.
322       AllMergePartsArePadding = false;
323     }
324 
325     // If we've filled up a complete piece with padding bits, we can directly
326     // emit the natural sized constant if applicable, rather than a merge of
327     // smaller constants.
328     if (AllMergePartsArePadding && !AllPadReg) {
329       if (PadStrategy == TargetOpcode::G_ANYEXT)
330         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
331       else if (PadStrategy == TargetOpcode::G_ZEXT)
332         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
333 
334       // If this is a sign extension, we can't materialize a trivial constant
335       // with the right type and have to produce a merge.
336     }
337 
338     if (AllPadReg) {
339       // Avoid creating additional instructions if we're just adding additional
340       // copies of padding bits.
341       Remerge[I] = AllPadReg;
342       continue;
343     }
344 
345     if (NumSubParts == 1)
346       Remerge[I] = SubMerge[0];
347     else
348       Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
349 
350     // In the sign extend padding case, re-use the first all-signbit merge.
351     if (AllMergePartsArePadding && !AllPadReg)
352       AllPadReg = Remerge[I];
353   }
354 
355   VRegs = std::move(Remerge);
356   return LCMTy;
357 }
358 
359 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
360                                                ArrayRef<Register> RemergeRegs) {
361   LLT DstTy = MRI.getType(DstReg);
362 
363   // Create the merge to the widened source, and extract the relevant bits into
364   // the result.
365 
366   if (DstTy == LCMTy) {
367     MIRBuilder.buildMerge(DstReg, RemergeRegs);
368     return;
369   }
370 
371   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
372   if (DstTy.isScalar() && LCMTy.isScalar()) {
373     MIRBuilder.buildTrunc(DstReg, Remerge);
374     return;
375   }
376 
377   if (LCMTy.isVector()) {
378     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
379     SmallVector<Register, 8> UnmergeDefs(NumDefs);
380     UnmergeDefs[0] = DstReg;
381     for (unsigned I = 1; I != NumDefs; ++I)
382       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
383 
384     MIRBuilder.buildUnmerge(UnmergeDefs,
385                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
386     return;
387   }
388 
389   llvm_unreachable("unhandled case");
390 }
391 
392 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
393 #define RTLIBCASE_INT(LibcallPrefix)                                           \
394   do {                                                                         \
395     switch (Size) {                                                            \
396     case 32:                                                                   \
397       return RTLIB::LibcallPrefix##32;                                         \
398     case 64:                                                                   \
399       return RTLIB::LibcallPrefix##64;                                         \
400     case 128:                                                                  \
401       return RTLIB::LibcallPrefix##128;                                        \
402     default:                                                                   \
403       llvm_unreachable("unexpected size");                                     \
404     }                                                                          \
405   } while (0)
406 
407 #define RTLIBCASE(LibcallPrefix)                                               \
408   do {                                                                         \
409     switch (Size) {                                                            \
410     case 32:                                                                   \
411       return RTLIB::LibcallPrefix##32;                                         \
412     case 64:                                                                   \
413       return RTLIB::LibcallPrefix##64;                                         \
414     case 80:                                                                   \
415       return RTLIB::LibcallPrefix##80;                                         \
416     case 128:                                                                  \
417       return RTLIB::LibcallPrefix##128;                                        \
418     default:                                                                   \
419       llvm_unreachable("unexpected size");                                     \
420     }                                                                          \
421   } while (0)
422 
423   switch (Opcode) {
424   case TargetOpcode::G_SDIV:
425     RTLIBCASE_INT(SDIV_I);
426   case TargetOpcode::G_UDIV:
427     RTLIBCASE_INT(UDIV_I);
428   case TargetOpcode::G_SREM:
429     RTLIBCASE_INT(SREM_I);
430   case TargetOpcode::G_UREM:
431     RTLIBCASE_INT(UREM_I);
432   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
433     RTLIBCASE_INT(CTLZ_I);
434   case TargetOpcode::G_FADD:
435     RTLIBCASE(ADD_F);
436   case TargetOpcode::G_FSUB:
437     RTLIBCASE(SUB_F);
438   case TargetOpcode::G_FMUL:
439     RTLIBCASE(MUL_F);
440   case TargetOpcode::G_FDIV:
441     RTLIBCASE(DIV_F);
442   case TargetOpcode::G_FEXP:
443     RTLIBCASE(EXP_F);
444   case TargetOpcode::G_FEXP2:
445     RTLIBCASE(EXP2_F);
446   case TargetOpcode::G_FREM:
447     RTLIBCASE(REM_F);
448   case TargetOpcode::G_FPOW:
449     RTLIBCASE(POW_F);
450   case TargetOpcode::G_FMA:
451     RTLIBCASE(FMA_F);
452   case TargetOpcode::G_FSIN:
453     RTLIBCASE(SIN_F);
454   case TargetOpcode::G_FCOS:
455     RTLIBCASE(COS_F);
456   case TargetOpcode::G_FLOG10:
457     RTLIBCASE(LOG10_F);
458   case TargetOpcode::G_FLOG:
459     RTLIBCASE(LOG_F);
460   case TargetOpcode::G_FLOG2:
461     RTLIBCASE(LOG2_F);
462   case TargetOpcode::G_FCEIL:
463     RTLIBCASE(CEIL_F);
464   case TargetOpcode::G_FFLOOR:
465     RTLIBCASE(FLOOR_F);
466   case TargetOpcode::G_FMINNUM:
467     RTLIBCASE(FMIN_F);
468   case TargetOpcode::G_FMAXNUM:
469     RTLIBCASE(FMAX_F);
470   case TargetOpcode::G_FSQRT:
471     RTLIBCASE(SQRT_F);
472   case TargetOpcode::G_FRINT:
473     RTLIBCASE(RINT_F);
474   case TargetOpcode::G_FNEARBYINT:
475     RTLIBCASE(NEARBYINT_F);
476   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
477     RTLIBCASE(ROUNDEVEN_F);
478   }
479   llvm_unreachable("Unknown libcall function");
480 }
481 
482 /// True if an instruction is in tail position in its caller. Intended for
483 /// legalizing libcalls as tail calls when possible.
484 static bool isLibCallInTailPosition(MachineInstr &MI,
485                                     const TargetInstrInfo &TII,
486                                     MachineRegisterInfo &MRI) {
487   MachineBasicBlock &MBB = *MI.getParent();
488   const Function &F = MBB.getParent()->getFunction();
489 
490   // Conservatively require the attributes of the call to match those of
491   // the return. Ignore NoAlias and NonNull because they don't affect the
492   // call sequence.
493   AttributeList CallerAttrs = F.getAttributes();
494   if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
495           .removeAttribute(Attribute::NoAlias)
496           .removeAttribute(Attribute::NonNull)
497           .hasAttributes())
498     return false;
499 
500   // It's not safe to eliminate the sign / zero extension of the return value.
501   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
502       CallerAttrs.hasRetAttr(Attribute::SExt))
503     return false;
504 
505   // Only tail call if the following instruction is a standard return or if we
506   // have a `thisreturn` callee, and a sequence like:
507   //
508   //   G_MEMCPY %0, %1, %2
509   //   $x0 = COPY %0
510   //   RET_ReallyLR implicit $x0
511   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
512   if (Next != MBB.instr_end() && Next->isCopy()) {
513     switch (MI.getOpcode()) {
514     default:
515       llvm_unreachable("unsupported opcode");
516     case TargetOpcode::G_BZERO:
517       return false;
518     case TargetOpcode::G_MEMCPY:
519     case TargetOpcode::G_MEMMOVE:
520     case TargetOpcode::G_MEMSET:
521       break;
522     }
523 
524     Register VReg = MI.getOperand(0).getReg();
525     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
526       return false;
527 
528     Register PReg = Next->getOperand(0).getReg();
529     if (!PReg.isPhysical())
530       return false;
531 
532     auto Ret = next_nodbg(Next, MBB.instr_end());
533     if (Ret == MBB.instr_end() || !Ret->isReturn())
534       return false;
535 
536     if (Ret->getNumImplicitOperands() != 1)
537       return false;
538 
539     if (PReg != Ret->getOperand(0).getReg())
540       return false;
541 
542     // Skip over the COPY that we just validated.
543     Next = Ret;
544   }
545 
546   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
547     return false;
548 
549   return true;
550 }
551 
552 LegalizerHelper::LegalizeResult
553 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
554                     const CallLowering::ArgInfo &Result,
555                     ArrayRef<CallLowering::ArgInfo> Args,
556                     const CallingConv::ID CC) {
557   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
558 
559   CallLowering::CallLoweringInfo Info;
560   Info.CallConv = CC;
561   Info.Callee = MachineOperand::CreateES(Name);
562   Info.OrigRet = Result;
563   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
564   if (!CLI.lowerCall(MIRBuilder, Info))
565     return LegalizerHelper::UnableToLegalize;
566 
567   return LegalizerHelper::Legalized;
568 }
569 
570 LegalizerHelper::LegalizeResult
571 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
572                     const CallLowering::ArgInfo &Result,
573                     ArrayRef<CallLowering::ArgInfo> Args) {
574   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
575   const char *Name = TLI.getLibcallName(Libcall);
576   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
577   return createLibcall(MIRBuilder, Name, Result, Args, CC);
578 }
579 
580 // Useful for libcalls where all operands have the same type.
581 static LegalizerHelper::LegalizeResult
582 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
583               Type *OpType) {
584   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
585 
586   // FIXME: What does the original arg index mean here?
587   SmallVector<CallLowering::ArgInfo, 3> Args;
588   for (unsigned i = 1; i < MI.getNumOperands(); i++)
589     Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
590   return createLibcall(MIRBuilder, Libcall,
591                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
592 }
593 
594 LegalizerHelper::LegalizeResult
595 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
596                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
597   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
598 
599   SmallVector<CallLowering::ArgInfo, 3> Args;
600   // Add all the args, except for the last which is an imm denoting 'tail'.
601   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
602     Register Reg = MI.getOperand(i).getReg();
603 
604     // Need derive an IR type for call lowering.
605     LLT OpLLT = MRI.getType(Reg);
606     Type *OpTy = nullptr;
607     if (OpLLT.isPointer())
608       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
609     else
610       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
611     Args.push_back({Reg, OpTy, 0});
612   }
613 
614   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
615   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
616   RTLIB::Libcall RTLibcall;
617   unsigned Opc = MI.getOpcode();
618   switch (Opc) {
619   case TargetOpcode::G_BZERO:
620     RTLibcall = RTLIB::BZERO;
621     break;
622   case TargetOpcode::G_MEMCPY:
623     RTLibcall = RTLIB::MEMCPY;
624     Args[0].Flags[0].setReturned();
625     break;
626   case TargetOpcode::G_MEMMOVE:
627     RTLibcall = RTLIB::MEMMOVE;
628     Args[0].Flags[0].setReturned();
629     break;
630   case TargetOpcode::G_MEMSET:
631     RTLibcall = RTLIB::MEMSET;
632     Args[0].Flags[0].setReturned();
633     break;
634   default:
635     llvm_unreachable("unsupported opcode");
636   }
637   const char *Name = TLI.getLibcallName(RTLibcall);
638 
639   // Unsupported libcall on the target.
640   if (!Name) {
641     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
642                       << MIRBuilder.getTII().getName(Opc) << "\n");
643     return LegalizerHelper::UnableToLegalize;
644   }
645 
646   CallLowering::CallLoweringInfo Info;
647   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
648   Info.Callee = MachineOperand::CreateES(Name);
649   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
650   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
651                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
652 
653   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
654   if (!CLI.lowerCall(MIRBuilder, Info))
655     return LegalizerHelper::UnableToLegalize;
656 
657   if (Info.LoweredTailCall) {
658     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
659 
660     // Check debug locations before removing the return.
661     LocObserver.checkpoint(true);
662 
663     // We must have a return following the call (or debug insts) to get past
664     // isLibCallInTailPosition.
665     do {
666       MachineInstr *Next = MI.getNextNode();
667       assert(Next &&
668              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
669              "Expected instr following MI to be return or debug inst?");
670       // We lowered a tail call, so the call is now the return from the block.
671       // Delete the old return.
672       Next->eraseFromParent();
673     } while (MI.getNextNode());
674 
675     // We expect to lose the debug location from the return.
676     LocObserver.checkpoint(false);
677   }
678 
679   return LegalizerHelper::Legalized;
680 }
681 
682 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
683                                        Type *FromType) {
684   auto ToMVT = MVT::getVT(ToType);
685   auto FromMVT = MVT::getVT(FromType);
686 
687   switch (Opcode) {
688   case TargetOpcode::G_FPEXT:
689     return RTLIB::getFPEXT(FromMVT, ToMVT);
690   case TargetOpcode::G_FPTRUNC:
691     return RTLIB::getFPROUND(FromMVT, ToMVT);
692   case TargetOpcode::G_FPTOSI:
693     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
694   case TargetOpcode::G_FPTOUI:
695     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
696   case TargetOpcode::G_SITOFP:
697     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
698   case TargetOpcode::G_UITOFP:
699     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
700   }
701   llvm_unreachable("Unsupported libcall function");
702 }
703 
704 static LegalizerHelper::LegalizeResult
705 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
706                   Type *FromType) {
707   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
708   return createLibcall(MIRBuilder, Libcall,
709                        {MI.getOperand(0).getReg(), ToType, 0},
710                        {{MI.getOperand(1).getReg(), FromType, 0}});
711 }
712 
713 LegalizerHelper::LegalizeResult
714 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
715   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
716   unsigned Size = LLTy.getSizeInBits();
717   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
718 
719   switch (MI.getOpcode()) {
720   default:
721     return UnableToLegalize;
722   case TargetOpcode::G_SDIV:
723   case TargetOpcode::G_UDIV:
724   case TargetOpcode::G_SREM:
725   case TargetOpcode::G_UREM:
726   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
727     Type *HLTy = IntegerType::get(Ctx, Size);
728     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
729     if (Status != Legalized)
730       return Status;
731     break;
732   }
733   case TargetOpcode::G_FADD:
734   case TargetOpcode::G_FSUB:
735   case TargetOpcode::G_FMUL:
736   case TargetOpcode::G_FDIV:
737   case TargetOpcode::G_FMA:
738   case TargetOpcode::G_FPOW:
739   case TargetOpcode::G_FREM:
740   case TargetOpcode::G_FCOS:
741   case TargetOpcode::G_FSIN:
742   case TargetOpcode::G_FLOG10:
743   case TargetOpcode::G_FLOG:
744   case TargetOpcode::G_FLOG2:
745   case TargetOpcode::G_FEXP:
746   case TargetOpcode::G_FEXP2:
747   case TargetOpcode::G_FCEIL:
748   case TargetOpcode::G_FFLOOR:
749   case TargetOpcode::G_FMINNUM:
750   case TargetOpcode::G_FMAXNUM:
751   case TargetOpcode::G_FSQRT:
752   case TargetOpcode::G_FRINT:
753   case TargetOpcode::G_FNEARBYINT:
754   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
755     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
756     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
757       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
758       return UnableToLegalize;
759     }
760     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
761     if (Status != Legalized)
762       return Status;
763     break;
764   }
765   case TargetOpcode::G_FPEXT:
766   case TargetOpcode::G_FPTRUNC: {
767     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
768     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
769     if (!FromTy || !ToTy)
770       return UnableToLegalize;
771     LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
772     if (Status != Legalized)
773       return Status;
774     break;
775   }
776   case TargetOpcode::G_FPTOSI:
777   case TargetOpcode::G_FPTOUI: {
778     // FIXME: Support other types
779     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
780     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
781     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
782       return UnableToLegalize;
783     LegalizeResult Status = conversionLibcall(
784         MI, MIRBuilder,
785         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
786         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
787     if (Status != Legalized)
788       return Status;
789     break;
790   }
791   case TargetOpcode::G_SITOFP:
792   case TargetOpcode::G_UITOFP: {
793     // FIXME: Support other types
794     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
795     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
796     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
797       return UnableToLegalize;
798     LegalizeResult Status = conversionLibcall(
799         MI, MIRBuilder,
800         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
801         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
802     if (Status != Legalized)
803       return Status;
804     break;
805   }
806   case TargetOpcode::G_BZERO:
807   case TargetOpcode::G_MEMCPY:
808   case TargetOpcode::G_MEMMOVE:
809   case TargetOpcode::G_MEMSET: {
810     LegalizeResult Result =
811         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
812     if (Result != Legalized)
813       return Result;
814     MI.eraseFromParent();
815     return Result;
816   }
817   }
818 
819   MI.eraseFromParent();
820   return Legalized;
821 }
822 
823 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
824                                                               unsigned TypeIdx,
825                                                               LLT NarrowTy) {
826   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
827   uint64_t NarrowSize = NarrowTy.getSizeInBits();
828 
829   switch (MI.getOpcode()) {
830   default:
831     return UnableToLegalize;
832   case TargetOpcode::G_IMPLICIT_DEF: {
833     Register DstReg = MI.getOperand(0).getReg();
834     LLT DstTy = MRI.getType(DstReg);
835 
836     // If SizeOp0 is not an exact multiple of NarrowSize, emit
837     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
838     // FIXME: Although this would also be legal for the general case, it causes
839     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
840     //  combines not being hit). This seems to be a problem related to the
841     //  artifact combiner.
842     if (SizeOp0 % NarrowSize != 0) {
843       LLT ImplicitTy = NarrowTy;
844       if (DstTy.isVector())
845         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
846 
847       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
848       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
849 
850       MI.eraseFromParent();
851       return Legalized;
852     }
853 
854     int NumParts = SizeOp0 / NarrowSize;
855 
856     SmallVector<Register, 2> DstRegs;
857     for (int i = 0; i < NumParts; ++i)
858       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
859 
860     if (DstTy.isVector())
861       MIRBuilder.buildBuildVector(DstReg, DstRegs);
862     else
863       MIRBuilder.buildMerge(DstReg, DstRegs);
864     MI.eraseFromParent();
865     return Legalized;
866   }
867   case TargetOpcode::G_CONSTANT: {
868     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
869     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
870     unsigned TotalSize = Ty.getSizeInBits();
871     unsigned NarrowSize = NarrowTy.getSizeInBits();
872     int NumParts = TotalSize / NarrowSize;
873 
874     SmallVector<Register, 4> PartRegs;
875     for (int I = 0; I != NumParts; ++I) {
876       unsigned Offset = I * NarrowSize;
877       auto K = MIRBuilder.buildConstant(NarrowTy,
878                                         Val.lshr(Offset).trunc(NarrowSize));
879       PartRegs.push_back(K.getReg(0));
880     }
881 
882     LLT LeftoverTy;
883     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
884     SmallVector<Register, 1> LeftoverRegs;
885     if (LeftoverBits != 0) {
886       LeftoverTy = LLT::scalar(LeftoverBits);
887       auto K = MIRBuilder.buildConstant(
888         LeftoverTy,
889         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
890       LeftoverRegs.push_back(K.getReg(0));
891     }
892 
893     insertParts(MI.getOperand(0).getReg(),
894                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
895 
896     MI.eraseFromParent();
897     return Legalized;
898   }
899   case TargetOpcode::G_SEXT:
900   case TargetOpcode::G_ZEXT:
901   case TargetOpcode::G_ANYEXT:
902     return narrowScalarExt(MI, TypeIdx, NarrowTy);
903   case TargetOpcode::G_TRUNC: {
904     if (TypeIdx != 1)
905       return UnableToLegalize;
906 
907     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
908     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
909       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
910       return UnableToLegalize;
911     }
912 
913     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
914     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
915     MI.eraseFromParent();
916     return Legalized;
917   }
918 
919   case TargetOpcode::G_FREEZE:
920     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
921   case TargetOpcode::G_ADD:
922   case TargetOpcode::G_SUB:
923   case TargetOpcode::G_SADDO:
924   case TargetOpcode::G_SSUBO:
925   case TargetOpcode::G_SADDE:
926   case TargetOpcode::G_SSUBE:
927   case TargetOpcode::G_UADDO:
928   case TargetOpcode::G_USUBO:
929   case TargetOpcode::G_UADDE:
930   case TargetOpcode::G_USUBE:
931     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
932   case TargetOpcode::G_MUL:
933   case TargetOpcode::G_UMULH:
934     return narrowScalarMul(MI, NarrowTy);
935   case TargetOpcode::G_EXTRACT:
936     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
937   case TargetOpcode::G_INSERT:
938     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
939   case TargetOpcode::G_LOAD: {
940     auto &LoadMI = cast<GLoad>(MI);
941     Register DstReg = LoadMI.getDstReg();
942     LLT DstTy = MRI.getType(DstReg);
943     if (DstTy.isVector())
944       return UnableToLegalize;
945 
946     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
947       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
948       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
949       MIRBuilder.buildAnyExt(DstReg, TmpReg);
950       LoadMI.eraseFromParent();
951       return Legalized;
952     }
953 
954     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
955   }
956   case TargetOpcode::G_ZEXTLOAD:
957   case TargetOpcode::G_SEXTLOAD: {
958     auto &LoadMI = cast<GExtLoad>(MI);
959     Register DstReg = LoadMI.getDstReg();
960     Register PtrReg = LoadMI.getPointerReg();
961 
962     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
963     auto &MMO = LoadMI.getMMO();
964     unsigned MemSize = MMO.getSizeInBits();
965 
966     if (MemSize == NarrowSize) {
967       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
968     } else if (MemSize < NarrowSize) {
969       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
970     } else if (MemSize > NarrowSize) {
971       // FIXME: Need to split the load.
972       return UnableToLegalize;
973     }
974 
975     if (isa<GZExtLoad>(LoadMI))
976       MIRBuilder.buildZExt(DstReg, TmpReg);
977     else
978       MIRBuilder.buildSExt(DstReg, TmpReg);
979 
980     LoadMI.eraseFromParent();
981     return Legalized;
982   }
983   case TargetOpcode::G_STORE: {
984     auto &StoreMI = cast<GStore>(MI);
985 
986     Register SrcReg = StoreMI.getValueReg();
987     LLT SrcTy = MRI.getType(SrcReg);
988     if (SrcTy.isVector())
989       return UnableToLegalize;
990 
991     int NumParts = SizeOp0 / NarrowSize;
992     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
993     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
994     if (SrcTy.isVector() && LeftoverBits != 0)
995       return UnableToLegalize;
996 
997     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
998       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
999       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1000       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1001       StoreMI.eraseFromParent();
1002       return Legalized;
1003     }
1004 
1005     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1006   }
1007   case TargetOpcode::G_SELECT:
1008     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1009   case TargetOpcode::G_AND:
1010   case TargetOpcode::G_OR:
1011   case TargetOpcode::G_XOR: {
1012     // Legalize bitwise operation:
1013     // A = BinOp<Ty> B, C
1014     // into:
1015     // B1, ..., BN = G_UNMERGE_VALUES B
1016     // C1, ..., CN = G_UNMERGE_VALUES C
1017     // A1 = BinOp<Ty/N> B1, C2
1018     // ...
1019     // AN = BinOp<Ty/N> BN, CN
1020     // A = G_MERGE_VALUES A1, ..., AN
1021     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1022   }
1023   case TargetOpcode::G_SHL:
1024   case TargetOpcode::G_LSHR:
1025   case TargetOpcode::G_ASHR:
1026     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1027   case TargetOpcode::G_CTLZ:
1028   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1029   case TargetOpcode::G_CTTZ:
1030   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1031   case TargetOpcode::G_CTPOP:
1032     if (TypeIdx == 1)
1033       switch (MI.getOpcode()) {
1034       case TargetOpcode::G_CTLZ:
1035       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1036         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1037       case TargetOpcode::G_CTTZ:
1038       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1039         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1040       case TargetOpcode::G_CTPOP:
1041         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1042       default:
1043         return UnableToLegalize;
1044       }
1045 
1046     Observer.changingInstr(MI);
1047     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1048     Observer.changedInstr(MI);
1049     return Legalized;
1050   case TargetOpcode::G_INTTOPTR:
1051     if (TypeIdx != 1)
1052       return UnableToLegalize;
1053 
1054     Observer.changingInstr(MI);
1055     narrowScalarSrc(MI, NarrowTy, 1);
1056     Observer.changedInstr(MI);
1057     return Legalized;
1058   case TargetOpcode::G_PTRTOINT:
1059     if (TypeIdx != 0)
1060       return UnableToLegalize;
1061 
1062     Observer.changingInstr(MI);
1063     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1064     Observer.changedInstr(MI);
1065     return Legalized;
1066   case TargetOpcode::G_PHI: {
1067     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1068     // NarrowSize.
1069     if (SizeOp0 % NarrowSize != 0)
1070       return UnableToLegalize;
1071 
1072     unsigned NumParts = SizeOp0 / NarrowSize;
1073     SmallVector<Register, 2> DstRegs(NumParts);
1074     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1075     Observer.changingInstr(MI);
1076     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1077       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1078       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
1079       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1080                    SrcRegs[i / 2]);
1081     }
1082     MachineBasicBlock &MBB = *MI.getParent();
1083     MIRBuilder.setInsertPt(MBB, MI);
1084     for (unsigned i = 0; i < NumParts; ++i) {
1085       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1086       MachineInstrBuilder MIB =
1087           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1088       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1089         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1090     }
1091     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1092     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1093     Observer.changedInstr(MI);
1094     MI.eraseFromParent();
1095     return Legalized;
1096   }
1097   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1098   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1099     if (TypeIdx != 2)
1100       return UnableToLegalize;
1101 
1102     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1103     Observer.changingInstr(MI);
1104     narrowScalarSrc(MI, NarrowTy, OpIdx);
1105     Observer.changedInstr(MI);
1106     return Legalized;
1107   }
1108   case TargetOpcode::G_ICMP: {
1109     Register LHS = MI.getOperand(2).getReg();
1110     LLT SrcTy = MRI.getType(LHS);
1111     uint64_t SrcSize = SrcTy.getSizeInBits();
1112     CmpInst::Predicate Pred =
1113         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1114 
1115     // TODO: Handle the non-equality case for weird sizes.
1116     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1117       return UnableToLegalize;
1118 
1119     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1120     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1121     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1122                       LHSLeftoverRegs))
1123       return UnableToLegalize;
1124 
1125     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1126     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1127     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1128                       RHSPartRegs, RHSLeftoverRegs))
1129       return UnableToLegalize;
1130 
1131     // We now have the LHS and RHS of the compare split into narrow-type
1132     // registers, plus potentially some leftover type.
1133     Register Dst = MI.getOperand(0).getReg();
1134     LLT ResTy = MRI.getType(Dst);
1135     if (ICmpInst::isEquality(Pred)) {
1136       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1137       // them together. For each equal part, the result should be all 0s. For
1138       // each non-equal part, we'll get at least one 1.
1139       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1140       SmallVector<Register, 4> Xors;
1141       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1142         auto LHS = std::get<0>(LHSAndRHS);
1143         auto RHS = std::get<1>(LHSAndRHS);
1144         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1145         Xors.push_back(Xor);
1146       }
1147 
1148       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1149       // to the desired narrow type so that we can OR them together later.
1150       SmallVector<Register, 4> WidenedXors;
1151       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1152         auto LHS = std::get<0>(LHSAndRHS);
1153         auto RHS = std::get<1>(LHSAndRHS);
1154         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1155         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1156         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1157                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1158         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1159       }
1160 
1161       // Now, for each part we broke up, we know if they are equal/not equal
1162       // based off the G_XOR. We can OR these all together and compare against
1163       // 0 to get the result.
1164       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1165       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1166       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1167         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1168       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1169     } else {
1170       // TODO: Handle non-power-of-two types.
1171       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1172       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1173       Register LHSL = LHSPartRegs[0];
1174       Register LHSH = LHSPartRegs[1];
1175       Register RHSL = RHSPartRegs[0];
1176       Register RHSH = RHSPartRegs[1];
1177       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1178       MachineInstrBuilder CmpHEQ =
1179           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1180       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1181           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1182       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1183     }
1184     MI.eraseFromParent();
1185     return Legalized;
1186   }
1187   case TargetOpcode::G_SEXT_INREG: {
1188     if (TypeIdx != 0)
1189       return UnableToLegalize;
1190 
1191     int64_t SizeInBits = MI.getOperand(2).getImm();
1192 
1193     // So long as the new type has more bits than the bits we're extending we
1194     // don't need to break it apart.
1195     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1196       Observer.changingInstr(MI);
1197       // We don't lose any non-extension bits by truncating the src and
1198       // sign-extending the dst.
1199       MachineOperand &MO1 = MI.getOperand(1);
1200       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1201       MO1.setReg(TruncMIB.getReg(0));
1202 
1203       MachineOperand &MO2 = MI.getOperand(0);
1204       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1205       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1206       MIRBuilder.buildSExt(MO2, DstExt);
1207       MO2.setReg(DstExt);
1208       Observer.changedInstr(MI);
1209       return Legalized;
1210     }
1211 
1212     // Break it apart. Components below the extension point are unmodified. The
1213     // component containing the extension point becomes a narrower SEXT_INREG.
1214     // Components above it are ashr'd from the component containing the
1215     // extension point.
1216     if (SizeOp0 % NarrowSize != 0)
1217       return UnableToLegalize;
1218     int NumParts = SizeOp0 / NarrowSize;
1219 
1220     // List the registers where the destination will be scattered.
1221     SmallVector<Register, 2> DstRegs;
1222     // List the registers where the source will be split.
1223     SmallVector<Register, 2> SrcRegs;
1224 
1225     // Create all the temporary registers.
1226     for (int i = 0; i < NumParts; ++i) {
1227       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1228 
1229       SrcRegs.push_back(SrcReg);
1230     }
1231 
1232     // Explode the big arguments into smaller chunks.
1233     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1234 
1235     Register AshrCstReg =
1236         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1237             .getReg(0);
1238     Register FullExtensionReg = 0;
1239     Register PartialExtensionReg = 0;
1240 
1241     // Do the operation on each small part.
1242     for (int i = 0; i < NumParts; ++i) {
1243       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1244         DstRegs.push_back(SrcRegs[i]);
1245       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1246         assert(PartialExtensionReg &&
1247                "Expected to visit partial extension before full");
1248         if (FullExtensionReg) {
1249           DstRegs.push_back(FullExtensionReg);
1250           continue;
1251         }
1252         DstRegs.push_back(
1253             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1254                 .getReg(0));
1255         FullExtensionReg = DstRegs.back();
1256       } else {
1257         DstRegs.push_back(
1258             MIRBuilder
1259                 .buildInstr(
1260                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1261                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1262                 .getReg(0));
1263         PartialExtensionReg = DstRegs.back();
1264       }
1265     }
1266 
1267     // Gather the destination registers into the final destination.
1268     Register DstReg = MI.getOperand(0).getReg();
1269     MIRBuilder.buildMerge(DstReg, DstRegs);
1270     MI.eraseFromParent();
1271     return Legalized;
1272   }
1273   case TargetOpcode::G_BSWAP:
1274   case TargetOpcode::G_BITREVERSE: {
1275     if (SizeOp0 % NarrowSize != 0)
1276       return UnableToLegalize;
1277 
1278     Observer.changingInstr(MI);
1279     SmallVector<Register, 2> SrcRegs, DstRegs;
1280     unsigned NumParts = SizeOp0 / NarrowSize;
1281     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1282 
1283     for (unsigned i = 0; i < NumParts; ++i) {
1284       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1285                                            {SrcRegs[NumParts - 1 - i]});
1286       DstRegs.push_back(DstPart.getReg(0));
1287     }
1288 
1289     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1290 
1291     Observer.changedInstr(MI);
1292     MI.eraseFromParent();
1293     return Legalized;
1294   }
1295   case TargetOpcode::G_PTR_ADD:
1296   case TargetOpcode::G_PTRMASK: {
1297     if (TypeIdx != 1)
1298       return UnableToLegalize;
1299     Observer.changingInstr(MI);
1300     narrowScalarSrc(MI, NarrowTy, 2);
1301     Observer.changedInstr(MI);
1302     return Legalized;
1303   }
1304   case TargetOpcode::G_FPTOUI:
1305   case TargetOpcode::G_FPTOSI:
1306     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1307   case TargetOpcode::G_FPEXT:
1308     if (TypeIdx != 0)
1309       return UnableToLegalize;
1310     Observer.changingInstr(MI);
1311     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1312     Observer.changedInstr(MI);
1313     return Legalized;
1314   }
1315 }
1316 
1317 Register LegalizerHelper::coerceToScalar(Register Val) {
1318   LLT Ty = MRI.getType(Val);
1319   if (Ty.isScalar())
1320     return Val;
1321 
1322   const DataLayout &DL = MIRBuilder.getDataLayout();
1323   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1324   if (Ty.isPointer()) {
1325     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1326       return Register();
1327     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1328   }
1329 
1330   Register NewVal = Val;
1331 
1332   assert(Ty.isVector());
1333   LLT EltTy = Ty.getElementType();
1334   if (EltTy.isPointer())
1335     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1336   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1337 }
1338 
1339 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1340                                      unsigned OpIdx, unsigned ExtOpcode) {
1341   MachineOperand &MO = MI.getOperand(OpIdx);
1342   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1343   MO.setReg(ExtB.getReg(0));
1344 }
1345 
1346 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1347                                       unsigned OpIdx) {
1348   MachineOperand &MO = MI.getOperand(OpIdx);
1349   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1350   MO.setReg(ExtB.getReg(0));
1351 }
1352 
1353 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1354                                      unsigned OpIdx, unsigned TruncOpcode) {
1355   MachineOperand &MO = MI.getOperand(OpIdx);
1356   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1357   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1358   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1359   MO.setReg(DstExt);
1360 }
1361 
1362 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1363                                       unsigned OpIdx, unsigned ExtOpcode) {
1364   MachineOperand &MO = MI.getOperand(OpIdx);
1365   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1366   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1367   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1368   MO.setReg(DstTrunc);
1369 }
1370 
1371 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1372                                             unsigned OpIdx) {
1373   MachineOperand &MO = MI.getOperand(OpIdx);
1374   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1375   MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
1376 }
1377 
1378 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1379                                             unsigned OpIdx) {
1380   MachineOperand &MO = MI.getOperand(OpIdx);
1381 
1382   LLT OldTy = MRI.getType(MO.getReg());
1383   unsigned OldElts = OldTy.getNumElements();
1384   unsigned NewElts = MoreTy.getNumElements();
1385 
1386   unsigned NumParts = NewElts / OldElts;
1387 
1388   // Use concat_vectors if the result is a multiple of the number of elements.
1389   if (NumParts * OldElts == NewElts) {
1390     SmallVector<Register, 8> Parts;
1391     Parts.push_back(MO.getReg());
1392 
1393     Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
1394     for (unsigned I = 1; I != NumParts; ++I)
1395       Parts.push_back(ImpDef);
1396 
1397     auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
1398     MO.setReg(Concat.getReg(0));
1399     return;
1400   }
1401 
1402   Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
1403   Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
1404   MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
1405   MO.setReg(MoreReg);
1406 }
1407 
1408 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1409   MachineOperand &Op = MI.getOperand(OpIdx);
1410   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1411 }
1412 
1413 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1414   MachineOperand &MO = MI.getOperand(OpIdx);
1415   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1416   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1417   MIRBuilder.buildBitcast(MO, CastDst);
1418   MO.setReg(CastDst);
1419 }
1420 
1421 LegalizerHelper::LegalizeResult
1422 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1423                                         LLT WideTy) {
1424   if (TypeIdx != 1)
1425     return UnableToLegalize;
1426 
1427   Register DstReg = MI.getOperand(0).getReg();
1428   LLT DstTy = MRI.getType(DstReg);
1429   if (DstTy.isVector())
1430     return UnableToLegalize;
1431 
1432   Register Src1 = MI.getOperand(1).getReg();
1433   LLT SrcTy = MRI.getType(Src1);
1434   const int DstSize = DstTy.getSizeInBits();
1435   const int SrcSize = SrcTy.getSizeInBits();
1436   const int WideSize = WideTy.getSizeInBits();
1437   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1438 
1439   unsigned NumOps = MI.getNumOperands();
1440   unsigned NumSrc = MI.getNumOperands() - 1;
1441   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1442 
1443   if (WideSize >= DstSize) {
1444     // Directly pack the bits in the target type.
1445     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1446 
1447     for (unsigned I = 2; I != NumOps; ++I) {
1448       const unsigned Offset = (I - 1) * PartSize;
1449 
1450       Register SrcReg = MI.getOperand(I).getReg();
1451       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1452 
1453       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1454 
1455       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1456         MRI.createGenericVirtualRegister(WideTy);
1457 
1458       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1459       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1460       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1461       ResultReg = NextResult;
1462     }
1463 
1464     if (WideSize > DstSize)
1465       MIRBuilder.buildTrunc(DstReg, ResultReg);
1466     else if (DstTy.isPointer())
1467       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1468 
1469     MI.eraseFromParent();
1470     return Legalized;
1471   }
1472 
1473   // Unmerge the original values to the GCD type, and recombine to the next
1474   // multiple greater than the original type.
1475   //
1476   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1477   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1478   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1479   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1480   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1481   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1482   // %12:_(s12) = G_MERGE_VALUES %10, %11
1483   //
1484   // Padding with undef if necessary:
1485   //
1486   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1487   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1488   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1489   // %7:_(s2) = G_IMPLICIT_DEF
1490   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1491   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1492   // %10:_(s12) = G_MERGE_VALUES %8, %9
1493 
1494   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1495   LLT GCDTy = LLT::scalar(GCD);
1496 
1497   SmallVector<Register, 8> Parts;
1498   SmallVector<Register, 8> NewMergeRegs;
1499   SmallVector<Register, 8> Unmerges;
1500   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1501 
1502   // Decompose the original operands if they don't evenly divide.
1503   for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
1504     Register SrcReg = MI.getOperand(I).getReg();
1505     if (GCD == SrcSize) {
1506       Unmerges.push_back(SrcReg);
1507     } else {
1508       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1509       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1510         Unmerges.push_back(Unmerge.getReg(J));
1511     }
1512   }
1513 
1514   // Pad with undef to the next size that is a multiple of the requested size.
1515   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1516     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1517     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1518       Unmerges.push_back(UndefReg);
1519   }
1520 
1521   const int PartsPerGCD = WideSize / GCD;
1522 
1523   // Build merges of each piece.
1524   ArrayRef<Register> Slicer(Unmerges);
1525   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1526     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1527     NewMergeRegs.push_back(Merge.getReg(0));
1528   }
1529 
1530   // A truncate may be necessary if the requested type doesn't evenly divide the
1531   // original result type.
1532   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1533     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1534   } else {
1535     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1536     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1537   }
1538 
1539   MI.eraseFromParent();
1540   return Legalized;
1541 }
1542 
1543 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1544   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1545   LLT OrigTy = MRI.getType(OrigReg);
1546   LLT LCMTy = getLCMType(WideTy, OrigTy);
1547 
1548   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1549   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1550 
1551   Register UnmergeSrc = WideReg;
1552 
1553   // Create a merge to the LCM type, padding with undef
1554   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1555   // =>
1556   // %1:_(<4 x s32>) = G_FOO
1557   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1558   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1559   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1560   if (NumMergeParts > 1) {
1561     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1562     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1563     MergeParts[0] = WideReg;
1564     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1565   }
1566 
1567   // Unmerge to the original register and pad with dead defs.
1568   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1569   UnmergeResults[0] = OrigReg;
1570   for (int I = 1; I != NumUnmergeParts; ++I)
1571     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1572 
1573   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1574   return WideReg;
1575 }
1576 
1577 LegalizerHelper::LegalizeResult
1578 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1579                                           LLT WideTy) {
1580   if (TypeIdx != 0)
1581     return UnableToLegalize;
1582 
1583   int NumDst = MI.getNumOperands() - 1;
1584   Register SrcReg = MI.getOperand(NumDst).getReg();
1585   LLT SrcTy = MRI.getType(SrcReg);
1586   if (SrcTy.isVector())
1587     return UnableToLegalize;
1588 
1589   Register Dst0Reg = MI.getOperand(0).getReg();
1590   LLT DstTy = MRI.getType(Dst0Reg);
1591   if (!DstTy.isScalar())
1592     return UnableToLegalize;
1593 
1594   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1595     if (SrcTy.isPointer()) {
1596       const DataLayout &DL = MIRBuilder.getDataLayout();
1597       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1598         LLVM_DEBUG(
1599             dbgs() << "Not casting non-integral address space integer\n");
1600         return UnableToLegalize;
1601       }
1602 
1603       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1604       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1605     }
1606 
1607     // Widen SrcTy to WideTy. This does not affect the result, but since the
1608     // user requested this size, it is probably better handled than SrcTy and
1609     // should reduce the total number of legalization artifacts
1610     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1611       SrcTy = WideTy;
1612       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1613     }
1614 
1615     // Theres no unmerge type to target. Directly extract the bits from the
1616     // source type
1617     unsigned DstSize = DstTy.getSizeInBits();
1618 
1619     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1620     for (int I = 1; I != NumDst; ++I) {
1621       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1622       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1623       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1624     }
1625 
1626     MI.eraseFromParent();
1627     return Legalized;
1628   }
1629 
1630   // Extend the source to a wider type.
1631   LLT LCMTy = getLCMType(SrcTy, WideTy);
1632 
1633   Register WideSrc = SrcReg;
1634   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1635     // TODO: If this is an integral address space, cast to integer and anyext.
1636     if (SrcTy.isPointer()) {
1637       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1638       return UnableToLegalize;
1639     }
1640 
1641     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1642   }
1643 
1644   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1645 
1646   // Create a sequence of unmerges and merges to the original results. Since we
1647   // may have widened the source, we will need to pad the results with dead defs
1648   // to cover the source register.
1649   // e.g. widen s48 to s64:
1650   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1651   //
1652   // =>
1653   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1654   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1655   //  ; unpack to GCD type, with extra dead defs
1656   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1657   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1658   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1659   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1660   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1661   const LLT GCDTy = getGCDType(WideTy, DstTy);
1662   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1663   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1664 
1665   // Directly unmerge to the destination without going through a GCD type
1666   // if possible
1667   if (PartsPerRemerge == 1) {
1668     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1669 
1670     for (int I = 0; I != NumUnmerge; ++I) {
1671       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1672 
1673       for (int J = 0; J != PartsPerUnmerge; ++J) {
1674         int Idx = I * PartsPerUnmerge + J;
1675         if (Idx < NumDst)
1676           MIB.addDef(MI.getOperand(Idx).getReg());
1677         else {
1678           // Create dead def for excess components.
1679           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1680         }
1681       }
1682 
1683       MIB.addUse(Unmerge.getReg(I));
1684     }
1685   } else {
1686     SmallVector<Register, 16> Parts;
1687     for (int J = 0; J != NumUnmerge; ++J)
1688       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1689 
1690     SmallVector<Register, 8> RemergeParts;
1691     for (int I = 0; I != NumDst; ++I) {
1692       for (int J = 0; J < PartsPerRemerge; ++J) {
1693         const int Idx = I * PartsPerRemerge + J;
1694         RemergeParts.emplace_back(Parts[Idx]);
1695       }
1696 
1697       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1698       RemergeParts.clear();
1699     }
1700   }
1701 
1702   MI.eraseFromParent();
1703   return Legalized;
1704 }
1705 
1706 LegalizerHelper::LegalizeResult
1707 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1708                                     LLT WideTy) {
1709   Register DstReg = MI.getOperand(0).getReg();
1710   Register SrcReg = MI.getOperand(1).getReg();
1711   LLT SrcTy = MRI.getType(SrcReg);
1712 
1713   LLT DstTy = MRI.getType(DstReg);
1714   unsigned Offset = MI.getOperand(2).getImm();
1715 
1716   if (TypeIdx == 0) {
1717     if (SrcTy.isVector() || DstTy.isVector())
1718       return UnableToLegalize;
1719 
1720     SrcOp Src(SrcReg);
1721     if (SrcTy.isPointer()) {
1722       // Extracts from pointers can be handled only if they are really just
1723       // simple integers.
1724       const DataLayout &DL = MIRBuilder.getDataLayout();
1725       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1726         return UnableToLegalize;
1727 
1728       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1729       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1730       SrcTy = SrcAsIntTy;
1731     }
1732 
1733     if (DstTy.isPointer())
1734       return UnableToLegalize;
1735 
1736     if (Offset == 0) {
1737       // Avoid a shift in the degenerate case.
1738       MIRBuilder.buildTrunc(DstReg,
1739                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1740       MI.eraseFromParent();
1741       return Legalized;
1742     }
1743 
1744     // Do a shift in the source type.
1745     LLT ShiftTy = SrcTy;
1746     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1747       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1748       ShiftTy = WideTy;
1749     }
1750 
1751     auto LShr = MIRBuilder.buildLShr(
1752       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1753     MIRBuilder.buildTrunc(DstReg, LShr);
1754     MI.eraseFromParent();
1755     return Legalized;
1756   }
1757 
1758   if (SrcTy.isScalar()) {
1759     Observer.changingInstr(MI);
1760     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1761     Observer.changedInstr(MI);
1762     return Legalized;
1763   }
1764 
1765   if (!SrcTy.isVector())
1766     return UnableToLegalize;
1767 
1768   if (DstTy != SrcTy.getElementType())
1769     return UnableToLegalize;
1770 
1771   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1772     return UnableToLegalize;
1773 
1774   Observer.changingInstr(MI);
1775   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1776 
1777   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1778                           Offset);
1779   widenScalarDst(MI, WideTy.getScalarType(), 0);
1780   Observer.changedInstr(MI);
1781   return Legalized;
1782 }
1783 
1784 LegalizerHelper::LegalizeResult
1785 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1786                                    LLT WideTy) {
1787   if (TypeIdx != 0 || WideTy.isVector())
1788     return UnableToLegalize;
1789   Observer.changingInstr(MI);
1790   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1791   widenScalarDst(MI, WideTy);
1792   Observer.changedInstr(MI);
1793   return Legalized;
1794 }
1795 
1796 LegalizerHelper::LegalizeResult
1797 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1798                                            LLT WideTy) {
1799   if (TypeIdx == 1)
1800     return UnableToLegalize; // TODO
1801 
1802   unsigned Opcode;
1803   unsigned ExtOpcode;
1804   Optional<Register> CarryIn = None;
1805   switch (MI.getOpcode()) {
1806   default:
1807     llvm_unreachable("Unexpected opcode!");
1808   case TargetOpcode::G_SADDO:
1809     Opcode = TargetOpcode::G_ADD;
1810     ExtOpcode = TargetOpcode::G_SEXT;
1811     break;
1812   case TargetOpcode::G_SSUBO:
1813     Opcode = TargetOpcode::G_SUB;
1814     ExtOpcode = TargetOpcode::G_SEXT;
1815     break;
1816   case TargetOpcode::G_UADDO:
1817     Opcode = TargetOpcode::G_ADD;
1818     ExtOpcode = TargetOpcode::G_ZEXT;
1819     break;
1820   case TargetOpcode::G_USUBO:
1821     Opcode = TargetOpcode::G_SUB;
1822     ExtOpcode = TargetOpcode::G_ZEXT;
1823     break;
1824   case TargetOpcode::G_SADDE:
1825     Opcode = TargetOpcode::G_UADDE;
1826     ExtOpcode = TargetOpcode::G_SEXT;
1827     CarryIn = MI.getOperand(4).getReg();
1828     break;
1829   case TargetOpcode::G_SSUBE:
1830     Opcode = TargetOpcode::G_USUBE;
1831     ExtOpcode = TargetOpcode::G_SEXT;
1832     CarryIn = MI.getOperand(4).getReg();
1833     break;
1834   case TargetOpcode::G_UADDE:
1835     Opcode = TargetOpcode::G_UADDE;
1836     ExtOpcode = TargetOpcode::G_ZEXT;
1837     CarryIn = MI.getOperand(4).getReg();
1838     break;
1839   case TargetOpcode::G_USUBE:
1840     Opcode = TargetOpcode::G_USUBE;
1841     ExtOpcode = TargetOpcode::G_ZEXT;
1842     CarryIn = MI.getOperand(4).getReg();
1843     break;
1844   }
1845 
1846   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1847   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1848   // Do the arithmetic in the larger type.
1849   Register NewOp;
1850   if (CarryIn) {
1851     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1852     NewOp = MIRBuilder
1853                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1854                             {LHSExt, RHSExt, *CarryIn})
1855                 .getReg(0);
1856   } else {
1857     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1858   }
1859   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1860   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1861   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1862   // There is no overflow if the ExtOp is the same as NewOp.
1863   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1864   // Now trunc the NewOp to the original result.
1865   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1866   MI.eraseFromParent();
1867   return Legalized;
1868 }
1869 
1870 LegalizerHelper::LegalizeResult
1871 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1872                                          LLT WideTy) {
1873   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1874                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1875                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1876   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1877                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1878   // We can convert this to:
1879   //   1. Any extend iN to iM
1880   //   2. SHL by M-N
1881   //   3. [US][ADD|SUB|SHL]SAT
1882   //   4. L/ASHR by M-N
1883   //
1884   // It may be more efficient to lower this to a min and a max operation in
1885   // the higher precision arithmetic if the promoted operation isn't legal,
1886   // but this decision is up to the target's lowering request.
1887   Register DstReg = MI.getOperand(0).getReg();
1888 
1889   unsigned NewBits = WideTy.getScalarSizeInBits();
1890   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1891 
1892   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1893   // must not left shift the RHS to preserve the shift amount.
1894   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1895   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1896                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1897   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1898   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1899   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1900 
1901   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1902                                         {ShiftL, ShiftR}, MI.getFlags());
1903 
1904   // Use a shift that will preserve the number of sign bits when the trunc is
1905   // folded away.
1906   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1907                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1908 
1909   MIRBuilder.buildTrunc(DstReg, Result);
1910   MI.eraseFromParent();
1911   return Legalized;
1912 }
1913 
1914 LegalizerHelper::LegalizeResult
1915 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1916                                  LLT WideTy) {
1917   if (TypeIdx == 1)
1918     return UnableToLegalize;
1919 
1920   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1921   Register Result = MI.getOperand(0).getReg();
1922   Register OriginalOverflow = MI.getOperand(1).getReg();
1923   Register LHS = MI.getOperand(2).getReg();
1924   Register RHS = MI.getOperand(3).getReg();
1925   LLT SrcTy = MRI.getType(LHS);
1926   LLT OverflowTy = MRI.getType(OriginalOverflow);
1927   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1928 
1929   // To determine if the result overflowed in the larger type, we extend the
1930   // input to the larger type, do the multiply (checking if it overflows),
1931   // then also check the high bits of the result to see if overflow happened
1932   // there.
1933   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1934   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1935   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1936 
1937   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1938                                     {LeftOperand, RightOperand});
1939   auto Mul = Mulo->getOperand(0);
1940   MIRBuilder.buildTrunc(Result, Mul);
1941 
1942   MachineInstrBuilder ExtResult;
1943   // Overflow occurred if it occurred in the larger type, or if the high part
1944   // of the result does not zero/sign-extend the low part.  Check this second
1945   // possibility first.
1946   if (IsSigned) {
1947     // For signed, overflow occurred when the high part does not sign-extend
1948     // the low part.
1949     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
1950   } else {
1951     // Unsigned overflow occurred when the high part does not zero-extend the
1952     // low part.
1953     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
1954   }
1955 
1956   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
1957   // so we don't need to check the overflow result of larger type Mulo.
1958   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
1959     auto Overflow =
1960         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
1961     // Finally check if the multiplication in the larger type itself overflowed.
1962     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
1963   } else {
1964     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
1965   }
1966   MI.eraseFromParent();
1967   return Legalized;
1968 }
1969 
1970 LegalizerHelper::LegalizeResult
1971 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1972   switch (MI.getOpcode()) {
1973   default:
1974     return UnableToLegalize;
1975   case TargetOpcode::G_ATOMICRMW_XCHG:
1976   case TargetOpcode::G_ATOMICRMW_ADD:
1977   case TargetOpcode::G_ATOMICRMW_SUB:
1978   case TargetOpcode::G_ATOMICRMW_AND:
1979   case TargetOpcode::G_ATOMICRMW_OR:
1980   case TargetOpcode::G_ATOMICRMW_XOR:
1981   case TargetOpcode::G_ATOMICRMW_MIN:
1982   case TargetOpcode::G_ATOMICRMW_MAX:
1983   case TargetOpcode::G_ATOMICRMW_UMIN:
1984   case TargetOpcode::G_ATOMICRMW_UMAX:
1985     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
1986     Observer.changingInstr(MI);
1987     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1988     widenScalarDst(MI, WideTy, 0);
1989     Observer.changedInstr(MI);
1990     return Legalized;
1991   case TargetOpcode::G_ATOMIC_CMPXCHG:
1992     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
1993     Observer.changingInstr(MI);
1994     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1995     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
1996     widenScalarDst(MI, WideTy, 0);
1997     Observer.changedInstr(MI);
1998     return Legalized;
1999   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2000     if (TypeIdx == 0) {
2001       Observer.changingInstr(MI);
2002       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2003       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2004       widenScalarDst(MI, WideTy, 0);
2005       Observer.changedInstr(MI);
2006       return Legalized;
2007     }
2008     assert(TypeIdx == 1 &&
2009            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2010     Observer.changingInstr(MI);
2011     widenScalarDst(MI, WideTy, 1);
2012     Observer.changedInstr(MI);
2013     return Legalized;
2014   case TargetOpcode::G_EXTRACT:
2015     return widenScalarExtract(MI, TypeIdx, WideTy);
2016   case TargetOpcode::G_INSERT:
2017     return widenScalarInsert(MI, TypeIdx, WideTy);
2018   case TargetOpcode::G_MERGE_VALUES:
2019     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2020   case TargetOpcode::G_UNMERGE_VALUES:
2021     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2022   case TargetOpcode::G_SADDO:
2023   case TargetOpcode::G_SSUBO:
2024   case TargetOpcode::G_UADDO:
2025   case TargetOpcode::G_USUBO:
2026   case TargetOpcode::G_SADDE:
2027   case TargetOpcode::G_SSUBE:
2028   case TargetOpcode::G_UADDE:
2029   case TargetOpcode::G_USUBE:
2030     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2031   case TargetOpcode::G_UMULO:
2032   case TargetOpcode::G_SMULO:
2033     return widenScalarMulo(MI, TypeIdx, WideTy);
2034   case TargetOpcode::G_SADDSAT:
2035   case TargetOpcode::G_SSUBSAT:
2036   case TargetOpcode::G_SSHLSAT:
2037   case TargetOpcode::G_UADDSAT:
2038   case TargetOpcode::G_USUBSAT:
2039   case TargetOpcode::G_USHLSAT:
2040     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2041   case TargetOpcode::G_CTTZ:
2042   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2043   case TargetOpcode::G_CTLZ:
2044   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2045   case TargetOpcode::G_CTPOP: {
2046     if (TypeIdx == 0) {
2047       Observer.changingInstr(MI);
2048       widenScalarDst(MI, WideTy, 0);
2049       Observer.changedInstr(MI);
2050       return Legalized;
2051     }
2052 
2053     Register SrcReg = MI.getOperand(1).getReg();
2054 
2055     // First extend the input.
2056     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2057                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2058                           ? TargetOpcode::G_ANYEXT
2059                           : TargetOpcode::G_ZEXT;
2060     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2061     LLT CurTy = MRI.getType(SrcReg);
2062     unsigned NewOpc = MI.getOpcode();
2063     if (NewOpc == TargetOpcode::G_CTTZ) {
2064       // The count is the same in the larger type except if the original
2065       // value was zero.  This can be handled by setting the bit just off
2066       // the top of the original type.
2067       auto TopBit =
2068           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2069       MIBSrc = MIRBuilder.buildOr(
2070         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2071       // Now we know the operand is non-zero, use the more relaxed opcode.
2072       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2073     }
2074 
2075     // Perform the operation at the larger size.
2076     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2077     // This is already the correct result for CTPOP and CTTZs
2078     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2079         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2080       // The correct result is NewOp - (Difference in widety and current ty).
2081       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2082       MIBNewOp = MIRBuilder.buildSub(
2083           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2084     }
2085 
2086     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2087     MI.eraseFromParent();
2088     return Legalized;
2089   }
2090   case TargetOpcode::G_BSWAP: {
2091     Observer.changingInstr(MI);
2092     Register DstReg = MI.getOperand(0).getReg();
2093 
2094     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2095     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2096     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2097     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2098 
2099     MI.getOperand(0).setReg(DstExt);
2100 
2101     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2102 
2103     LLT Ty = MRI.getType(DstReg);
2104     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2105     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2106     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2107 
2108     MIRBuilder.buildTrunc(DstReg, ShrReg);
2109     Observer.changedInstr(MI);
2110     return Legalized;
2111   }
2112   case TargetOpcode::G_BITREVERSE: {
2113     Observer.changingInstr(MI);
2114 
2115     Register DstReg = MI.getOperand(0).getReg();
2116     LLT Ty = MRI.getType(DstReg);
2117     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2118 
2119     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2121     MI.getOperand(0).setReg(DstExt);
2122     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2123 
2124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2125     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2126     MIRBuilder.buildTrunc(DstReg, Shift);
2127     Observer.changedInstr(MI);
2128     return Legalized;
2129   }
2130   case TargetOpcode::G_FREEZE:
2131     Observer.changingInstr(MI);
2132     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2133     widenScalarDst(MI, WideTy);
2134     Observer.changedInstr(MI);
2135     return Legalized;
2136 
2137   case TargetOpcode::G_ABS:
2138     Observer.changingInstr(MI);
2139     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2140     widenScalarDst(MI, WideTy);
2141     Observer.changedInstr(MI);
2142     return Legalized;
2143 
2144   case TargetOpcode::G_ADD:
2145   case TargetOpcode::G_AND:
2146   case TargetOpcode::G_MUL:
2147   case TargetOpcode::G_OR:
2148   case TargetOpcode::G_XOR:
2149   case TargetOpcode::G_SUB:
2150     // Perform operation at larger width (any extension is fines here, high bits
2151     // don't affect the result) and then truncate the result back to the
2152     // original type.
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2155     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2156     widenScalarDst(MI, WideTy);
2157     Observer.changedInstr(MI);
2158     return Legalized;
2159 
2160   case TargetOpcode::G_SBFX:
2161   case TargetOpcode::G_UBFX:
2162     Observer.changingInstr(MI);
2163 
2164     if (TypeIdx == 0) {
2165       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2166       widenScalarDst(MI, WideTy);
2167     } else {
2168       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2169       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2170     }
2171 
2172     Observer.changedInstr(MI);
2173     return Legalized;
2174 
2175   case TargetOpcode::G_SHL:
2176     Observer.changingInstr(MI);
2177 
2178     if (TypeIdx == 0) {
2179       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180       widenScalarDst(MI, WideTy);
2181     } else {
2182       assert(TypeIdx == 1);
2183       // The "number of bits to shift" operand must preserve its value as an
2184       // unsigned integer:
2185       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2186     }
2187 
2188     Observer.changedInstr(MI);
2189     return Legalized;
2190 
2191   case TargetOpcode::G_SDIV:
2192   case TargetOpcode::G_SREM:
2193   case TargetOpcode::G_SMIN:
2194   case TargetOpcode::G_SMAX:
2195     Observer.changingInstr(MI);
2196     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2197     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2198     widenScalarDst(MI, WideTy);
2199     Observer.changedInstr(MI);
2200     return Legalized;
2201 
2202   case TargetOpcode::G_SDIVREM:
2203     Observer.changingInstr(MI);
2204     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2205     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2206     widenScalarDst(MI, WideTy);
2207     widenScalarDst(MI, WideTy, 1);
2208     Observer.changedInstr(MI);
2209     return Legalized;
2210 
2211   case TargetOpcode::G_ASHR:
2212   case TargetOpcode::G_LSHR:
2213     Observer.changingInstr(MI);
2214 
2215     if (TypeIdx == 0) {
2216       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2217         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2218 
2219       widenScalarSrc(MI, WideTy, 1, CvtOp);
2220       widenScalarDst(MI, WideTy);
2221     } else {
2222       assert(TypeIdx == 1);
2223       // The "number of bits to shift" operand must preserve its value as an
2224       // unsigned integer:
2225       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2226     }
2227 
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230   case TargetOpcode::G_UDIV:
2231   case TargetOpcode::G_UREM:
2232   case TargetOpcode::G_UMIN:
2233   case TargetOpcode::G_UMAX:
2234     Observer.changingInstr(MI);
2235     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2236     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2237     widenScalarDst(MI, WideTy);
2238     Observer.changedInstr(MI);
2239     return Legalized;
2240 
2241   case TargetOpcode::G_UDIVREM:
2242     Observer.changingInstr(MI);
2243     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     widenScalarDst(MI, WideTy);
2246     widenScalarDst(MI, WideTy, 1);
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SELECT:
2251     Observer.changingInstr(MI);
2252     if (TypeIdx == 0) {
2253       // Perform operation at larger width (any extension is fine here, high
2254       // bits don't affect the result) and then truncate the result back to the
2255       // original type.
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2257       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2258       widenScalarDst(MI, WideTy);
2259     } else {
2260       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2261       // Explicit extension is required here since high bits affect the result.
2262       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2263     }
2264     Observer.changedInstr(MI);
2265     return Legalized;
2266 
2267   case TargetOpcode::G_FPTOSI:
2268   case TargetOpcode::G_FPTOUI:
2269     Observer.changingInstr(MI);
2270 
2271     if (TypeIdx == 0)
2272       widenScalarDst(MI, WideTy);
2273     else
2274       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2275 
2276     Observer.changedInstr(MI);
2277     return Legalized;
2278   case TargetOpcode::G_SITOFP:
2279     Observer.changingInstr(MI);
2280 
2281     if (TypeIdx == 0)
2282       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2283     else
2284       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2285 
2286     Observer.changedInstr(MI);
2287     return Legalized;
2288   case TargetOpcode::G_UITOFP:
2289     Observer.changingInstr(MI);
2290 
2291     if (TypeIdx == 0)
2292       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2293     else
2294       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295 
2296     Observer.changedInstr(MI);
2297     return Legalized;
2298   case TargetOpcode::G_LOAD:
2299   case TargetOpcode::G_SEXTLOAD:
2300   case TargetOpcode::G_ZEXTLOAD:
2301     Observer.changingInstr(MI);
2302     widenScalarDst(MI, WideTy);
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305 
2306   case TargetOpcode::G_STORE: {
2307     if (TypeIdx != 0)
2308       return UnableToLegalize;
2309 
2310     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2311     if (!Ty.isScalar())
2312       return UnableToLegalize;
2313 
2314     Observer.changingInstr(MI);
2315 
2316     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2317       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2318     widenScalarSrc(MI, WideTy, 0, ExtType);
2319 
2320     Observer.changedInstr(MI);
2321     return Legalized;
2322   }
2323   case TargetOpcode::G_CONSTANT: {
2324     MachineOperand &SrcMO = MI.getOperand(1);
2325     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2326     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2327         MRI.getType(MI.getOperand(0).getReg()));
2328     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2329             ExtOpc == TargetOpcode::G_ANYEXT) &&
2330            "Illegal Extend");
2331     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2332     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2333                            ? SrcVal.sext(WideTy.getSizeInBits())
2334                            : SrcVal.zext(WideTy.getSizeInBits());
2335     Observer.changingInstr(MI);
2336     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2337 
2338     widenScalarDst(MI, WideTy);
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341   }
2342   case TargetOpcode::G_FCONSTANT: {
2343     MachineOperand &SrcMO = MI.getOperand(1);
2344     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2345     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2346     bool LosesInfo;
2347     switch (WideTy.getSizeInBits()) {
2348     case 32:
2349       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2350                   &LosesInfo);
2351       break;
2352     case 64:
2353       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2354                   &LosesInfo);
2355       break;
2356     default:
2357       return UnableToLegalize;
2358     }
2359 
2360     assert(!LosesInfo && "extend should always be lossless");
2361 
2362     Observer.changingInstr(MI);
2363     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2364 
2365     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2366     Observer.changedInstr(MI);
2367     return Legalized;
2368   }
2369   case TargetOpcode::G_IMPLICIT_DEF: {
2370     Observer.changingInstr(MI);
2371     widenScalarDst(MI, WideTy);
2372     Observer.changedInstr(MI);
2373     return Legalized;
2374   }
2375   case TargetOpcode::G_BRCOND:
2376     Observer.changingInstr(MI);
2377     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_FCMP:
2382     Observer.changingInstr(MI);
2383     if (TypeIdx == 0)
2384       widenScalarDst(MI, WideTy);
2385     else {
2386       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2387       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2388     }
2389     Observer.changedInstr(MI);
2390     return Legalized;
2391 
2392   case TargetOpcode::G_ICMP:
2393     Observer.changingInstr(MI);
2394     if (TypeIdx == 0)
2395       widenScalarDst(MI, WideTy);
2396     else {
2397       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2398                                MI.getOperand(1).getPredicate()))
2399                                ? TargetOpcode::G_SEXT
2400                                : TargetOpcode::G_ZEXT;
2401       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2402       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2403     }
2404     Observer.changedInstr(MI);
2405     return Legalized;
2406 
2407   case TargetOpcode::G_PTR_ADD:
2408     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2409     Observer.changingInstr(MI);
2410     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2411     Observer.changedInstr(MI);
2412     return Legalized;
2413 
2414   case TargetOpcode::G_PHI: {
2415     assert(TypeIdx == 0 && "Expecting only Idx 0");
2416 
2417     Observer.changingInstr(MI);
2418     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2419       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2420       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2421       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2422     }
2423 
2424     MachineBasicBlock &MBB = *MI.getParent();
2425     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2426     widenScalarDst(MI, WideTy);
2427     Observer.changedInstr(MI);
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2431     if (TypeIdx == 0) {
2432       Register VecReg = MI.getOperand(1).getReg();
2433       LLT VecTy = MRI.getType(VecReg);
2434       Observer.changingInstr(MI);
2435 
2436       widenScalarSrc(
2437           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2438           TargetOpcode::G_SEXT);
2439 
2440       widenScalarDst(MI, WideTy, 0);
2441       Observer.changedInstr(MI);
2442       return Legalized;
2443     }
2444 
2445     if (TypeIdx != 2)
2446       return UnableToLegalize;
2447     Observer.changingInstr(MI);
2448     // TODO: Probably should be zext
2449     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2450     Observer.changedInstr(MI);
2451     return Legalized;
2452   }
2453   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2454     if (TypeIdx == 1) {
2455       Observer.changingInstr(MI);
2456 
2457       Register VecReg = MI.getOperand(1).getReg();
2458       LLT VecTy = MRI.getType(VecReg);
2459       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2460 
2461       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2462       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2463       widenScalarDst(MI, WideVecTy, 0);
2464       Observer.changedInstr(MI);
2465       return Legalized;
2466     }
2467 
2468     if (TypeIdx == 2) {
2469       Observer.changingInstr(MI);
2470       // TODO: Probably should be zext
2471       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2472       Observer.changedInstr(MI);
2473       return Legalized;
2474     }
2475 
2476     return UnableToLegalize;
2477   }
2478   case TargetOpcode::G_FADD:
2479   case TargetOpcode::G_FMUL:
2480   case TargetOpcode::G_FSUB:
2481   case TargetOpcode::G_FMA:
2482   case TargetOpcode::G_FMAD:
2483   case TargetOpcode::G_FNEG:
2484   case TargetOpcode::G_FABS:
2485   case TargetOpcode::G_FCANONICALIZE:
2486   case TargetOpcode::G_FMINNUM:
2487   case TargetOpcode::G_FMAXNUM:
2488   case TargetOpcode::G_FMINNUM_IEEE:
2489   case TargetOpcode::G_FMAXNUM_IEEE:
2490   case TargetOpcode::G_FMINIMUM:
2491   case TargetOpcode::G_FMAXIMUM:
2492   case TargetOpcode::G_FDIV:
2493   case TargetOpcode::G_FREM:
2494   case TargetOpcode::G_FCEIL:
2495   case TargetOpcode::G_FFLOOR:
2496   case TargetOpcode::G_FCOS:
2497   case TargetOpcode::G_FSIN:
2498   case TargetOpcode::G_FLOG10:
2499   case TargetOpcode::G_FLOG:
2500   case TargetOpcode::G_FLOG2:
2501   case TargetOpcode::G_FRINT:
2502   case TargetOpcode::G_FNEARBYINT:
2503   case TargetOpcode::G_FSQRT:
2504   case TargetOpcode::G_FEXP:
2505   case TargetOpcode::G_FEXP2:
2506   case TargetOpcode::G_FPOW:
2507   case TargetOpcode::G_INTRINSIC_TRUNC:
2508   case TargetOpcode::G_INTRINSIC_ROUND:
2509   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2510     assert(TypeIdx == 0);
2511     Observer.changingInstr(MI);
2512 
2513     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2514       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2515 
2516     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2517     Observer.changedInstr(MI);
2518     return Legalized;
2519   case TargetOpcode::G_FPOWI: {
2520     if (TypeIdx != 0)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2524     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INTTOPTR:
2529     if (TypeIdx != 1)
2530       return UnableToLegalize;
2531 
2532     Observer.changingInstr(MI);
2533     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2534     Observer.changedInstr(MI);
2535     return Legalized;
2536   case TargetOpcode::G_PTRTOINT:
2537     if (TypeIdx != 0)
2538       return UnableToLegalize;
2539 
2540     Observer.changingInstr(MI);
2541     widenScalarDst(MI, WideTy, 0);
2542     Observer.changedInstr(MI);
2543     return Legalized;
2544   case TargetOpcode::G_BUILD_VECTOR: {
2545     Observer.changingInstr(MI);
2546 
2547     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2548     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2549       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2550 
2551     // Avoid changing the result vector type if the source element type was
2552     // requested.
2553     if (TypeIdx == 1) {
2554       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2555     } else {
2556       widenScalarDst(MI, WideTy, 0);
2557     }
2558 
2559     Observer.changedInstr(MI);
2560     return Legalized;
2561   }
2562   case TargetOpcode::G_SEXT_INREG:
2563     if (TypeIdx != 0)
2564       return UnableToLegalize;
2565 
2566     Observer.changingInstr(MI);
2567     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2568     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2569     Observer.changedInstr(MI);
2570     return Legalized;
2571   case TargetOpcode::G_PTRMASK: {
2572     if (TypeIdx != 1)
2573       return UnableToLegalize;
2574     Observer.changingInstr(MI);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     Observer.changedInstr(MI);
2577     return Legalized;
2578   }
2579   }
2580 }
2581 
2582 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2583                              MachineIRBuilder &B, Register Src, LLT Ty) {
2584   auto Unmerge = B.buildUnmerge(Ty, Src);
2585   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2586     Pieces.push_back(Unmerge.getReg(I));
2587 }
2588 
2589 LegalizerHelper::LegalizeResult
2590 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2591   Register Dst = MI.getOperand(0).getReg();
2592   Register Src = MI.getOperand(1).getReg();
2593   LLT DstTy = MRI.getType(Dst);
2594   LLT SrcTy = MRI.getType(Src);
2595 
2596   if (SrcTy.isVector()) {
2597     LLT SrcEltTy = SrcTy.getElementType();
2598     SmallVector<Register, 8> SrcRegs;
2599 
2600     if (DstTy.isVector()) {
2601       int NumDstElt = DstTy.getNumElements();
2602       int NumSrcElt = SrcTy.getNumElements();
2603 
2604       LLT DstEltTy = DstTy.getElementType();
2605       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2606       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2607 
2608       // If there's an element size mismatch, insert intermediate casts to match
2609       // the result element type.
2610       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2611         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2612         //
2613         // =>
2614         //
2615         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2616         // %3:_(<2 x s8>) = G_BITCAST %2
2617         // %4:_(<2 x s8>) = G_BITCAST %3
2618         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
2619         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2620         SrcPartTy = SrcEltTy;
2621       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2622         //
2623         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2624         //
2625         // =>
2626         //
2627         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2628         // %3:_(s16) = G_BITCAST %2
2629         // %4:_(s16) = G_BITCAST %3
2630         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
2631         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2632         DstCastTy = DstEltTy;
2633       }
2634 
2635       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2636       for (Register &SrcReg : SrcRegs)
2637         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2638     } else
2639       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2640 
2641     MIRBuilder.buildMerge(Dst, SrcRegs);
2642     MI.eraseFromParent();
2643     return Legalized;
2644   }
2645 
2646   if (DstTy.isVector()) {
2647     SmallVector<Register, 8> SrcRegs;
2648     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2649     MIRBuilder.buildMerge(Dst, SrcRegs);
2650     MI.eraseFromParent();
2651     return Legalized;
2652   }
2653 
2654   return UnableToLegalize;
2655 }
2656 
2657 /// Figure out the bit offset into a register when coercing a vector index for
2658 /// the wide element type. This is only for the case when promoting vector to
2659 /// one with larger elements.
2660 //
2661 ///
2662 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2663 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2664 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2665                                                    Register Idx,
2666                                                    unsigned NewEltSize,
2667                                                    unsigned OldEltSize) {
2668   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2669   LLT IdxTy = B.getMRI()->getType(Idx);
2670 
2671   // Now figure out the amount we need to shift to get the target bits.
2672   auto OffsetMask = B.buildConstant(
2673     IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
2674   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2675   return B.buildShl(IdxTy, OffsetIdx,
2676                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2677 }
2678 
2679 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2680 /// is casting to a vector with a smaller element size, perform multiple element
2681 /// extracts and merge the results. If this is coercing to a vector with larger
2682 /// elements, index the bitcasted vector and extract the target element with bit
2683 /// operations. This is intended to force the indexing in the native register
2684 /// size for architectures that can dynamically index the register file.
2685 LegalizerHelper::LegalizeResult
2686 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2687                                          LLT CastTy) {
2688   if (TypeIdx != 1)
2689     return UnableToLegalize;
2690 
2691   Register Dst = MI.getOperand(0).getReg();
2692   Register SrcVec = MI.getOperand(1).getReg();
2693   Register Idx = MI.getOperand(2).getReg();
2694   LLT SrcVecTy = MRI.getType(SrcVec);
2695   LLT IdxTy = MRI.getType(Idx);
2696 
2697   LLT SrcEltTy = SrcVecTy.getElementType();
2698   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2699   unsigned OldNumElts = SrcVecTy.getNumElements();
2700 
2701   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2702   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2703 
2704   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2705   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2706   if (NewNumElts > OldNumElts) {
2707     // Decreasing the vector element size
2708     //
2709     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2710     //  =>
2711     //  v4i32:castx = bitcast x:v2i64
2712     //
2713     // i64 = bitcast
2714     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2715     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2716     //
2717     if (NewNumElts % OldNumElts != 0)
2718       return UnableToLegalize;
2719 
2720     // Type of the intermediate result vector.
2721     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2722     LLT MidTy =
2723         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2724 
2725     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2726 
2727     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2728     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2729 
2730     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2731       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2732       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2733       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2734       NewOps[I] = Elt.getReg(0);
2735     }
2736 
2737     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2738     MIRBuilder.buildBitcast(Dst, NewVec);
2739     MI.eraseFromParent();
2740     return Legalized;
2741   }
2742 
2743   if (NewNumElts < OldNumElts) {
2744     if (NewEltSize % OldEltSize != 0)
2745       return UnableToLegalize;
2746 
2747     // This only depends on powers of 2 because we use bit tricks to figure out
2748     // the bit offset we need to shift to get the target element. A general
2749     // expansion could emit division/multiply.
2750     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2751       return UnableToLegalize;
2752 
2753     // Increasing the vector element size.
2754     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2755     //
2756     //   =>
2757     //
2758     // %cast = G_BITCAST %vec
2759     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2760     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2761     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2762     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2763     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2764     // %elt = G_TRUNC %elt_bits
2765 
2766     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2767     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2768 
2769     // Divide to get the index in the wider element type.
2770     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2771 
2772     Register WideElt = CastVec;
2773     if (CastTy.isVector()) {
2774       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2775                                                      ScaledIdx).getReg(0);
2776     }
2777 
2778     // Compute the bit offset into the register of the target element.
2779     Register OffsetBits = getBitcastWiderVectorElementOffset(
2780       MIRBuilder, Idx, NewEltSize, OldEltSize);
2781 
2782     // Shift the wide element to get the target element.
2783     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2784     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2785     MI.eraseFromParent();
2786     return Legalized;
2787   }
2788 
2789   return UnableToLegalize;
2790 }
2791 
2792 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2793 /// TargetReg, while preserving other bits in \p TargetReg.
2794 ///
2795 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2796 static Register buildBitFieldInsert(MachineIRBuilder &B,
2797                                     Register TargetReg, Register InsertReg,
2798                                     Register OffsetBits) {
2799   LLT TargetTy = B.getMRI()->getType(TargetReg);
2800   LLT InsertTy = B.getMRI()->getType(InsertReg);
2801   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2802   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2803 
2804   // Produce a bitmask of the value to insert
2805   auto EltMask = B.buildConstant(
2806     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2807                                    InsertTy.getSizeInBits()));
2808   // Shift it into position
2809   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2810   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2811 
2812   // Clear out the bits in the wide element
2813   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2814 
2815   // The value to insert has all zeros already, so stick it into the masked
2816   // wide element.
2817   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2818 }
2819 
2820 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2821 /// is increasing the element size, perform the indexing in the target element
2822 /// type, and use bit operations to insert at the element position. This is
2823 /// intended for architectures that can dynamically index the register file and
2824 /// want to force indexing in the native register size.
2825 LegalizerHelper::LegalizeResult
2826 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2827                                         LLT CastTy) {
2828   if (TypeIdx != 0)
2829     return UnableToLegalize;
2830 
2831   Register Dst = MI.getOperand(0).getReg();
2832   Register SrcVec = MI.getOperand(1).getReg();
2833   Register Val = MI.getOperand(2).getReg();
2834   Register Idx = MI.getOperand(3).getReg();
2835 
2836   LLT VecTy = MRI.getType(Dst);
2837   LLT IdxTy = MRI.getType(Idx);
2838 
2839   LLT VecEltTy = VecTy.getElementType();
2840   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2841   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2842   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2843 
2844   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2845   unsigned OldNumElts = VecTy.getNumElements();
2846 
2847   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2848   if (NewNumElts < OldNumElts) {
2849     if (NewEltSize % OldEltSize != 0)
2850       return UnableToLegalize;
2851 
2852     // This only depends on powers of 2 because we use bit tricks to figure out
2853     // the bit offset we need to shift to get the target element. A general
2854     // expansion could emit division/multiply.
2855     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2856       return UnableToLegalize;
2857 
2858     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2859     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2860 
2861     // Divide to get the index in the wider element type.
2862     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2863 
2864     Register ExtractedElt = CastVec;
2865     if (CastTy.isVector()) {
2866       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2867                                                           ScaledIdx).getReg(0);
2868     }
2869 
2870     // Compute the bit offset into the register of the target element.
2871     Register OffsetBits = getBitcastWiderVectorElementOffset(
2872       MIRBuilder, Idx, NewEltSize, OldEltSize);
2873 
2874     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2875                                                Val, OffsetBits);
2876     if (CastTy.isVector()) {
2877       InsertedElt = MIRBuilder.buildInsertVectorElement(
2878         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2879     }
2880 
2881     MIRBuilder.buildBitcast(Dst, InsertedElt);
2882     MI.eraseFromParent();
2883     return Legalized;
2884   }
2885 
2886   return UnableToLegalize;
2887 }
2888 
2889 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2890   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2891   Register DstReg = LoadMI.getDstReg();
2892   Register PtrReg = LoadMI.getPointerReg();
2893   LLT DstTy = MRI.getType(DstReg);
2894   MachineMemOperand &MMO = LoadMI.getMMO();
2895   LLT MemTy = MMO.getMemoryType();
2896   MachineFunction &MF = MIRBuilder.getMF();
2897 
2898   unsigned MemSizeInBits = MemTy.getSizeInBits();
2899   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2900 
2901   if (MemSizeInBits != MemStoreSizeInBits) {
2902     if (MemTy.isVector())
2903       return UnableToLegalize;
2904 
2905     // Promote to a byte-sized load if not loading an integral number of
2906     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2907     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2908     MachineMemOperand *NewMMO =
2909         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2910 
2911     Register LoadReg = DstReg;
2912     LLT LoadTy = DstTy;
2913 
2914     // If this wasn't already an extending load, we need to widen the result
2915     // register to avoid creating a load with a narrower result than the source.
2916     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2917       LoadTy = WideMemTy;
2918       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2919     }
2920 
2921     if (isa<GSExtLoad>(LoadMI)) {
2922       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2923       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2924     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2925       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2926       // The extra bits are guaranteed to be zero, since we stored them that
2927       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2928       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
2929     } else {
2930       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
2931     }
2932 
2933     if (DstTy != LoadTy)
2934       MIRBuilder.buildTrunc(DstReg, LoadReg);
2935 
2936     LoadMI.eraseFromParent();
2937     return Legalized;
2938   }
2939 
2940   // Big endian lowering not implemented.
2941   if (MIRBuilder.getDataLayout().isBigEndian())
2942     return UnableToLegalize;
2943 
2944   // This load needs splitting into power of 2 sized loads.
2945   //
2946   // Our strategy here is to generate anyextending loads for the smaller
2947   // types up to next power-2 result type, and then combine the two larger
2948   // result values together, before truncating back down to the non-pow-2
2949   // type.
2950   // E.g. v1 = i24 load =>
2951   // v2 = i32 zextload (2 byte)
2952   // v3 = i32 load (1 byte)
2953   // v4 = i32 shl v3, 16
2954   // v5 = i32 or v4, v2
2955   // v1 = i24 trunc v5
2956   // By doing this we generate the correct truncate which should get
2957   // combined away as an artifact with a matching extend.
2958 
2959   uint64_t LargeSplitSize, SmallSplitSize;
2960 
2961   if (!isPowerOf2_32(MemSizeInBits)) {
2962     // This load needs splitting into power of 2 sized loads.
2963     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
2964     SmallSplitSize = MemSizeInBits - LargeSplitSize;
2965   } else {
2966     // This is already a power of 2, but we still need to split this in half.
2967     //
2968     // Assume we're being asked to decompose an unaligned load.
2969     // TODO: If this requires multiple splits, handle them all at once.
2970     auto &Ctx = MF.getFunction().getContext();
2971     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
2972       return UnableToLegalize;
2973 
2974     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
2975   }
2976 
2977   if (MemTy.isVector()) {
2978     // TODO: Handle vector extloads
2979     if (MemTy != DstTy)
2980       return UnableToLegalize;
2981 
2982     // TODO: We can do better than scalarizing the vector and at least split it
2983     // in half.
2984     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
2985   }
2986 
2987   MachineMemOperand *LargeMMO =
2988       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
2989   MachineMemOperand *SmallMMO =
2990       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
2991 
2992   LLT PtrTy = MRI.getType(PtrReg);
2993   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
2994   LLT AnyExtTy = LLT::scalar(AnyExtSize);
2995   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
2996                                              PtrReg, *LargeMMO);
2997 
2998   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
2999                                             LargeSplitSize / 8);
3000   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3001   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3002   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3003                                              SmallPtr, *SmallMMO);
3004 
3005   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3006   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3007 
3008   if (AnyExtTy == DstTy)
3009     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3010   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3011     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3012     MIRBuilder.buildTrunc(DstReg, {Or});
3013   } else {
3014     assert(DstTy.isPointer() && "expected pointer");
3015     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3016 
3017     // FIXME: We currently consider this to be illegal for non-integral address
3018     // spaces, but we need still need a way to reinterpret the bits.
3019     MIRBuilder.buildIntToPtr(DstReg, Or);
3020   }
3021 
3022   LoadMI.eraseFromParent();
3023   return Legalized;
3024 }
3025 
3026 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3027   // Lower a non-power of 2 store into multiple pow-2 stores.
3028   // E.g. split an i24 store into an i16 store + i8 store.
3029   // We do this by first extending the stored value to the next largest power
3030   // of 2 type, and then using truncating stores to store the components.
3031   // By doing this, likewise with G_LOAD, generate an extend that can be
3032   // artifact-combined away instead of leaving behind extracts.
3033   Register SrcReg = StoreMI.getValueReg();
3034   Register PtrReg = StoreMI.getPointerReg();
3035   LLT SrcTy = MRI.getType(SrcReg);
3036   MachineFunction &MF = MIRBuilder.getMF();
3037   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3038   LLT MemTy = MMO.getMemoryType();
3039 
3040   unsigned StoreWidth = MemTy.getSizeInBits();
3041   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3042 
3043   if (StoreWidth != StoreSizeInBits) {
3044     if (SrcTy.isVector())
3045       return UnableToLegalize;
3046 
3047     // Promote to a byte-sized store with upper bits zero if not
3048     // storing an integral number of bytes.  For example, promote
3049     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3050     LLT WideTy = LLT::scalar(StoreSizeInBits);
3051 
3052     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3053       // Avoid creating a store with a narrower source than result.
3054       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3055       SrcTy = WideTy;
3056     }
3057 
3058     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3059 
3060     MachineMemOperand *NewMMO =
3061         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3062     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3063     StoreMI.eraseFromParent();
3064     return Legalized;
3065   }
3066 
3067   if (MemTy.isVector()) {
3068     // TODO: Handle vector trunc stores
3069     if (MemTy != SrcTy)
3070       return UnableToLegalize;
3071 
3072     // TODO: We can do better than scalarizing the vector and at least split it
3073     // in half.
3074     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3075   }
3076 
3077   unsigned MemSizeInBits = MemTy.getSizeInBits();
3078   uint64_t LargeSplitSize, SmallSplitSize;
3079 
3080   if (!isPowerOf2_32(MemSizeInBits)) {
3081     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3082     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3083   } else {
3084     auto &Ctx = MF.getFunction().getContext();
3085     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3086       return UnableToLegalize; // Don't know what we're being asked to do.
3087 
3088     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3089   }
3090 
3091   // Extend to the next pow-2. If this store was itself the result of lowering,
3092   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3093   // that's wider than the stored size.
3094   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3095   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3096 
3097   if (SrcTy.isPointer()) {
3098     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3099     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3100   }
3101 
3102   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3103 
3104   // Obtain the smaller value by shifting away the larger value.
3105   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3106   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3107 
3108   // Generate the PtrAdd and truncating stores.
3109   LLT PtrTy = MRI.getType(PtrReg);
3110   auto OffsetCst = MIRBuilder.buildConstant(
3111     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3112   auto SmallPtr =
3113     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3114 
3115   MachineMemOperand *LargeMMO =
3116     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3117   MachineMemOperand *SmallMMO =
3118     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3119   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3120   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3121   StoreMI.eraseFromParent();
3122   return Legalized;
3123 }
3124 
3125 LegalizerHelper::LegalizeResult
3126 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3127   switch (MI.getOpcode()) {
3128   case TargetOpcode::G_LOAD: {
3129     if (TypeIdx != 0)
3130       return UnableToLegalize;
3131     MachineMemOperand &MMO = **MI.memoperands_begin();
3132 
3133     // Not sure how to interpret a bitcast of an extending load.
3134     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3135       return UnableToLegalize;
3136 
3137     Observer.changingInstr(MI);
3138     bitcastDst(MI, CastTy, 0);
3139     MMO.setType(CastTy);
3140     Observer.changedInstr(MI);
3141     return Legalized;
3142   }
3143   case TargetOpcode::G_STORE: {
3144     if (TypeIdx != 0)
3145       return UnableToLegalize;
3146 
3147     MachineMemOperand &MMO = **MI.memoperands_begin();
3148 
3149     // Not sure how to interpret a bitcast of a truncating store.
3150     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3151       return UnableToLegalize;
3152 
3153     Observer.changingInstr(MI);
3154     bitcastSrc(MI, CastTy, 0);
3155     MMO.setType(CastTy);
3156     Observer.changedInstr(MI);
3157     return Legalized;
3158   }
3159   case TargetOpcode::G_SELECT: {
3160     if (TypeIdx != 0)
3161       return UnableToLegalize;
3162 
3163     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3164       LLVM_DEBUG(
3165           dbgs() << "bitcast action not implemented for vector select\n");
3166       return UnableToLegalize;
3167     }
3168 
3169     Observer.changingInstr(MI);
3170     bitcastSrc(MI, CastTy, 2);
3171     bitcastSrc(MI, CastTy, 3);
3172     bitcastDst(MI, CastTy, 0);
3173     Observer.changedInstr(MI);
3174     return Legalized;
3175   }
3176   case TargetOpcode::G_AND:
3177   case TargetOpcode::G_OR:
3178   case TargetOpcode::G_XOR: {
3179     Observer.changingInstr(MI);
3180     bitcastSrc(MI, CastTy, 1);
3181     bitcastSrc(MI, CastTy, 2);
3182     bitcastDst(MI, CastTy, 0);
3183     Observer.changedInstr(MI);
3184     return Legalized;
3185   }
3186   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3187     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3188   case TargetOpcode::G_INSERT_VECTOR_ELT:
3189     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3190   default:
3191     return UnableToLegalize;
3192   }
3193 }
3194 
3195 // Legalize an instruction by changing the opcode in place.
3196 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3197     Observer.changingInstr(MI);
3198     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3199     Observer.changedInstr(MI);
3200 }
3201 
3202 LegalizerHelper::LegalizeResult
3203 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3204   using namespace TargetOpcode;
3205 
3206   switch(MI.getOpcode()) {
3207   default:
3208     return UnableToLegalize;
3209   case TargetOpcode::G_BITCAST:
3210     return lowerBitcast(MI);
3211   case TargetOpcode::G_SREM:
3212   case TargetOpcode::G_UREM: {
3213     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3214     auto Quot =
3215         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3216                               {MI.getOperand(1), MI.getOperand(2)});
3217 
3218     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3219     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3220     MI.eraseFromParent();
3221     return Legalized;
3222   }
3223   case TargetOpcode::G_SADDO:
3224   case TargetOpcode::G_SSUBO:
3225     return lowerSADDO_SSUBO(MI);
3226   case TargetOpcode::G_UMULH:
3227   case TargetOpcode::G_SMULH:
3228     return lowerSMULH_UMULH(MI);
3229   case TargetOpcode::G_SMULO:
3230   case TargetOpcode::G_UMULO: {
3231     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3232     // result.
3233     Register Res = MI.getOperand(0).getReg();
3234     Register Overflow = MI.getOperand(1).getReg();
3235     Register LHS = MI.getOperand(2).getReg();
3236     Register RHS = MI.getOperand(3).getReg();
3237     LLT Ty = MRI.getType(Res);
3238 
3239     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3240                           ? TargetOpcode::G_SMULH
3241                           : TargetOpcode::G_UMULH;
3242 
3243     Observer.changingInstr(MI);
3244     const auto &TII = MIRBuilder.getTII();
3245     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3246     MI.RemoveOperand(1);
3247     Observer.changedInstr(MI);
3248 
3249     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3250     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3251 
3252     // Move insert point forward so we can use the Res register if needed.
3253     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3254 
3255     // For *signed* multiply, overflow is detected by checking:
3256     // (hi != (lo >> bitwidth-1))
3257     if (Opcode == TargetOpcode::G_SMULH) {
3258       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3259       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3260       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3261     } else {
3262       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3263     }
3264     return Legalized;
3265   }
3266   case TargetOpcode::G_FNEG: {
3267     Register Res = MI.getOperand(0).getReg();
3268     LLT Ty = MRI.getType(Res);
3269 
3270     // TODO: Handle vector types once we are able to
3271     // represent them.
3272     if (Ty.isVector())
3273       return UnableToLegalize;
3274     auto SignMask =
3275         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3276     Register SubByReg = MI.getOperand(1).getReg();
3277     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3278     MI.eraseFromParent();
3279     return Legalized;
3280   }
3281   case TargetOpcode::G_FSUB: {
3282     Register Res = MI.getOperand(0).getReg();
3283     LLT Ty = MRI.getType(Res);
3284 
3285     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3286     // First, check if G_FNEG is marked as Lower. If so, we may
3287     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3288     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3289       return UnableToLegalize;
3290     Register LHS = MI.getOperand(1).getReg();
3291     Register RHS = MI.getOperand(2).getReg();
3292     Register Neg = MRI.createGenericVirtualRegister(Ty);
3293     MIRBuilder.buildFNeg(Neg, RHS);
3294     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_FMAD:
3299     return lowerFMad(MI);
3300   case TargetOpcode::G_FFLOOR:
3301     return lowerFFloor(MI);
3302   case TargetOpcode::G_INTRINSIC_ROUND:
3303     return lowerIntrinsicRound(MI);
3304   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3305     // Since round even is the assumed rounding mode for unconstrained FP
3306     // operations, rint and roundeven are the same operation.
3307     changeOpcode(MI, TargetOpcode::G_FRINT);
3308     return Legalized;
3309   }
3310   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3311     Register OldValRes = MI.getOperand(0).getReg();
3312     Register SuccessRes = MI.getOperand(1).getReg();
3313     Register Addr = MI.getOperand(2).getReg();
3314     Register CmpVal = MI.getOperand(3).getReg();
3315     Register NewVal = MI.getOperand(4).getReg();
3316     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3317                                   **MI.memoperands_begin());
3318     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3319     MI.eraseFromParent();
3320     return Legalized;
3321   }
3322   case TargetOpcode::G_LOAD:
3323   case TargetOpcode::G_SEXTLOAD:
3324   case TargetOpcode::G_ZEXTLOAD:
3325     return lowerLoad(cast<GAnyLoad>(MI));
3326   case TargetOpcode::G_STORE:
3327     return lowerStore(cast<GStore>(MI));
3328   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3329   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3330   case TargetOpcode::G_CTLZ:
3331   case TargetOpcode::G_CTTZ:
3332   case TargetOpcode::G_CTPOP:
3333     return lowerBitCount(MI);
3334   case G_UADDO: {
3335     Register Res = MI.getOperand(0).getReg();
3336     Register CarryOut = MI.getOperand(1).getReg();
3337     Register LHS = MI.getOperand(2).getReg();
3338     Register RHS = MI.getOperand(3).getReg();
3339 
3340     MIRBuilder.buildAdd(Res, LHS, RHS);
3341     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3342 
3343     MI.eraseFromParent();
3344     return Legalized;
3345   }
3346   case G_UADDE: {
3347     Register Res = MI.getOperand(0).getReg();
3348     Register CarryOut = MI.getOperand(1).getReg();
3349     Register LHS = MI.getOperand(2).getReg();
3350     Register RHS = MI.getOperand(3).getReg();
3351     Register CarryIn = MI.getOperand(4).getReg();
3352     LLT Ty = MRI.getType(Res);
3353 
3354     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3355     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3356     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3357     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3358 
3359     MI.eraseFromParent();
3360     return Legalized;
3361   }
3362   case G_USUBO: {
3363     Register Res = MI.getOperand(0).getReg();
3364     Register BorrowOut = MI.getOperand(1).getReg();
3365     Register LHS = MI.getOperand(2).getReg();
3366     Register RHS = MI.getOperand(3).getReg();
3367 
3368     MIRBuilder.buildSub(Res, LHS, RHS);
3369     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3370 
3371     MI.eraseFromParent();
3372     return Legalized;
3373   }
3374   case G_USUBE: {
3375     Register Res = MI.getOperand(0).getReg();
3376     Register BorrowOut = MI.getOperand(1).getReg();
3377     Register LHS = MI.getOperand(2).getReg();
3378     Register RHS = MI.getOperand(3).getReg();
3379     Register BorrowIn = MI.getOperand(4).getReg();
3380     const LLT CondTy = MRI.getType(BorrowOut);
3381     const LLT Ty = MRI.getType(Res);
3382 
3383     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3384     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3385     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3386 
3387     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3388     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3389     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3390 
3391     MI.eraseFromParent();
3392     return Legalized;
3393   }
3394   case G_UITOFP:
3395     return lowerUITOFP(MI);
3396   case G_SITOFP:
3397     return lowerSITOFP(MI);
3398   case G_FPTOUI:
3399     return lowerFPTOUI(MI);
3400   case G_FPTOSI:
3401     return lowerFPTOSI(MI);
3402   case G_FPTRUNC:
3403     return lowerFPTRUNC(MI);
3404   case G_FPOWI:
3405     return lowerFPOWI(MI);
3406   case G_SMIN:
3407   case G_SMAX:
3408   case G_UMIN:
3409   case G_UMAX:
3410     return lowerMinMax(MI);
3411   case G_FCOPYSIGN:
3412     return lowerFCopySign(MI);
3413   case G_FMINNUM:
3414   case G_FMAXNUM:
3415     return lowerFMinNumMaxNum(MI);
3416   case G_MERGE_VALUES:
3417     return lowerMergeValues(MI);
3418   case G_UNMERGE_VALUES:
3419     return lowerUnmergeValues(MI);
3420   case TargetOpcode::G_SEXT_INREG: {
3421     assert(MI.getOperand(2).isImm() && "Expected immediate");
3422     int64_t SizeInBits = MI.getOperand(2).getImm();
3423 
3424     Register DstReg = MI.getOperand(0).getReg();
3425     Register SrcReg = MI.getOperand(1).getReg();
3426     LLT DstTy = MRI.getType(DstReg);
3427     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3428 
3429     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3430     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3431     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3432     MI.eraseFromParent();
3433     return Legalized;
3434   }
3435   case G_EXTRACT_VECTOR_ELT:
3436   case G_INSERT_VECTOR_ELT:
3437     return lowerExtractInsertVectorElt(MI);
3438   case G_SHUFFLE_VECTOR:
3439     return lowerShuffleVector(MI);
3440   case G_DYN_STACKALLOC:
3441     return lowerDynStackAlloc(MI);
3442   case G_EXTRACT:
3443     return lowerExtract(MI);
3444   case G_INSERT:
3445     return lowerInsert(MI);
3446   case G_BSWAP:
3447     return lowerBswap(MI);
3448   case G_BITREVERSE:
3449     return lowerBitreverse(MI);
3450   case G_READ_REGISTER:
3451   case G_WRITE_REGISTER:
3452     return lowerReadWriteRegister(MI);
3453   case G_UADDSAT:
3454   case G_USUBSAT: {
3455     // Try to make a reasonable guess about which lowering strategy to use. The
3456     // target can override this with custom lowering and calling the
3457     // implementation functions.
3458     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3459     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3460       return lowerAddSubSatToMinMax(MI);
3461     return lowerAddSubSatToAddoSubo(MI);
3462   }
3463   case G_SADDSAT:
3464   case G_SSUBSAT: {
3465     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3466 
3467     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3468     // since it's a shorter expansion. However, we would need to figure out the
3469     // preferred boolean type for the carry out for the query.
3470     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3471       return lowerAddSubSatToMinMax(MI);
3472     return lowerAddSubSatToAddoSubo(MI);
3473   }
3474   case G_SSHLSAT:
3475   case G_USHLSAT:
3476     return lowerShlSat(MI);
3477   case G_ABS:
3478     return lowerAbsToAddXor(MI);
3479   case G_SELECT:
3480     return lowerSelect(MI);
3481   case G_SDIVREM:
3482   case G_UDIVREM:
3483     return lowerDIVREM(MI);
3484   case G_FSHL:
3485   case G_FSHR:
3486     return lowerFunnelShift(MI);
3487   case G_ROTL:
3488   case G_ROTR:
3489     return lowerRotate(MI);
3490   case G_ISNAN:
3491     return lowerIsNaN(MI);
3492   GISEL_VECREDUCE_CASES_NONSEQ
3493     return lowerVectorReduction(MI);
3494   }
3495 }
3496 
3497 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3498                                                   Align MinAlign) const {
3499   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3500   // datalayout for the preferred alignment. Also there should be a target hook
3501   // for this to allow targets to reduce the alignment and ignore the
3502   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3503   // the type.
3504   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3505 }
3506 
3507 MachineInstrBuilder
3508 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3509                                       MachinePointerInfo &PtrInfo) {
3510   MachineFunction &MF = MIRBuilder.getMF();
3511   const DataLayout &DL = MIRBuilder.getDataLayout();
3512   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3513 
3514   unsigned AddrSpace = DL.getAllocaAddrSpace();
3515   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3516 
3517   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3518   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3519 }
3520 
3521 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3522                                         LLT VecTy) {
3523   int64_t IdxVal;
3524   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3525     return IdxReg;
3526 
3527   LLT IdxTy = B.getMRI()->getType(IdxReg);
3528   unsigned NElts = VecTy.getNumElements();
3529   if (isPowerOf2_32(NElts)) {
3530     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3531     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3532   }
3533 
3534   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3535       .getReg(0);
3536 }
3537 
3538 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3539                                                   Register Index) {
3540   LLT EltTy = VecTy.getElementType();
3541 
3542   // Calculate the element offset and add it to the pointer.
3543   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3544   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3545          "Converting bits to bytes lost precision");
3546 
3547   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3548 
3549   LLT IdxTy = MRI.getType(Index);
3550   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3551                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3552 
3553   LLT PtrTy = MRI.getType(VecPtr);
3554   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3555 }
3556 
3557 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3558     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3559   Register DstReg = MI.getOperand(0).getReg();
3560   LLT DstTy = MRI.getType(DstReg);
3561   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3562 
3563   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3564 
3565   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3566   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3567 
3568   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3569   MI.eraseFromParent();
3570   return Legalized;
3571 }
3572 
3573 // Handle splitting vector operations which need to have the same number of
3574 // elements in each type index, but each type index may have a different element
3575 // type.
3576 //
3577 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3578 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3579 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3580 //
3581 // Also handles some irregular breakdown cases, e.g.
3582 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3583 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3584 //             s64 = G_SHL s64, s32
3585 LegalizerHelper::LegalizeResult
3586 LegalizerHelper::fewerElementsVectorMultiEltType(
3587   MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
3588   if (TypeIdx != 0)
3589     return UnableToLegalize;
3590 
3591   const LLT NarrowTy0 = NarrowTyArg;
3592   const Register DstReg = MI.getOperand(0).getReg();
3593   LLT DstTy = MRI.getType(DstReg);
3594   LLT LeftoverTy0;
3595 
3596   // All of the operands need to have the same number of elements, so if we can
3597   // determine a type breakdown for the result type, we can for all of the
3598   // source types.
3599   int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
3600   if (NumParts < 0)
3601     return UnableToLegalize;
3602 
3603   SmallVector<MachineInstrBuilder, 4> NewInsts;
3604 
3605   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3606   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3607 
3608   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
3609     Register SrcReg = MI.getOperand(I).getReg();
3610     LLT SrcTyI = MRI.getType(SrcReg);
3611     const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
3612                                             : ElementCount::getFixed(1);
3613     LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
3614     LLT LeftoverTyI;
3615 
3616     // Split this operand into the requested typed registers, and any leftover
3617     // required to reproduce the original type.
3618     if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
3619                       LeftoverRegs))
3620       return UnableToLegalize;
3621 
3622     if (I == 1) {
3623       // For the first operand, create an instruction for each part and setup
3624       // the result.
3625       for (Register PartReg : PartRegs) {
3626         Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3627         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3628                                .addDef(PartDstReg)
3629                                .addUse(PartReg));
3630         DstRegs.push_back(PartDstReg);
3631       }
3632 
3633       for (Register LeftoverReg : LeftoverRegs) {
3634         Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
3635         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3636                                .addDef(PartDstReg)
3637                                .addUse(LeftoverReg));
3638         LeftoverDstRegs.push_back(PartDstReg);
3639       }
3640     } else {
3641       assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());
3642 
3643       // Add the newly created operand splits to the existing instructions. The
3644       // odd-sized pieces are ordered after the requested NarrowTyArg sized
3645       // pieces.
3646       unsigned InstCount = 0;
3647       for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
3648         NewInsts[InstCount++].addUse(PartRegs[J]);
3649       for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
3650         NewInsts[InstCount++].addUse(LeftoverRegs[J]);
3651     }
3652 
3653     PartRegs.clear();
3654     LeftoverRegs.clear();
3655   }
3656 
3657   // Insert the newly built operations and rebuild the result register.
3658   for (auto &MIB : NewInsts)
3659     MIRBuilder.insertInstr(MIB);
3660 
3661   insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);
3662 
3663   MI.eraseFromParent();
3664   return Legalized;
3665 }
3666 
3667 LegalizerHelper::LegalizeResult
3668 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3669                                           LLT NarrowTy) {
3670   if (TypeIdx != 0)
3671     return UnableToLegalize;
3672 
3673   Register DstReg = MI.getOperand(0).getReg();
3674   Register SrcReg = MI.getOperand(1).getReg();
3675   LLT DstTy = MRI.getType(DstReg);
3676   LLT SrcTy = MRI.getType(SrcReg);
3677 
3678   LLT NarrowTy0 = NarrowTy;
3679   LLT NarrowTy1;
3680   unsigned NumParts;
3681 
3682   if (NarrowTy.isVector()) {
3683     // Uneven breakdown not handled.
3684     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3685     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3686       return UnableToLegalize;
3687 
3688     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3689   } else {
3690     NumParts = DstTy.getNumElements();
3691     NarrowTy1 = SrcTy.getElementType();
3692   }
3693 
3694   SmallVector<Register, 4> SrcRegs, DstRegs;
3695   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3696 
3697   for (unsigned I = 0; I < NumParts; ++I) {
3698     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3699     MachineInstr *NewInst =
3700         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3701 
3702     NewInst->setFlags(MI.getFlags());
3703     DstRegs.push_back(DstReg);
3704   }
3705 
3706   if (NarrowTy.isVector())
3707     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3708   else
3709     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3710 
3711   MI.eraseFromParent();
3712   return Legalized;
3713 }
3714 
3715 LegalizerHelper::LegalizeResult
3716 LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
3717                                         LLT NarrowTy) {
3718   Register DstReg = MI.getOperand(0).getReg();
3719   Register Src0Reg = MI.getOperand(2).getReg();
3720   LLT DstTy = MRI.getType(DstReg);
3721   LLT SrcTy = MRI.getType(Src0Reg);
3722 
3723   unsigned NumParts;
3724   LLT NarrowTy0, NarrowTy1;
3725 
3726   if (TypeIdx == 0) {
3727     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3728     unsigned OldElts = DstTy.getNumElements();
3729 
3730     NarrowTy0 = NarrowTy;
3731     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
3732     NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
3733                                                   SrcTy.getScalarSizeInBits())
3734                                     : SrcTy.getElementType();
3735 
3736   } else {
3737     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3738     unsigned OldElts = SrcTy.getNumElements();
3739 
3740     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) :
3741       NarrowTy.getNumElements();
3742     NarrowTy0 =
3743         LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
3744     NarrowTy1 = NarrowTy;
3745   }
3746 
3747   // FIXME: Don't know how to handle the situation where the small vectors
3748   // aren't all the same size yet.
3749   if (NarrowTy1.isVector() &&
3750       NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
3751     return UnableToLegalize;
3752 
3753   CmpInst::Predicate Pred
3754     = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3755 
3756   SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
3757   extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
3758   extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);
3759 
3760   for (unsigned I = 0; I < NumParts; ++I) {
3761     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3762     DstRegs.push_back(DstReg);
3763 
3764     if (MI.getOpcode() == TargetOpcode::G_ICMP)
3765       MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3766     else {
3767       MachineInstr *NewCmp
3768         = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3769       NewCmp->setFlags(MI.getFlags());
3770     }
3771   }
3772 
3773   if (NarrowTy1.isVector())
3774     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3775   else
3776     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3777 
3778   MI.eraseFromParent();
3779   return Legalized;
3780 }
3781 
3782 LegalizerHelper::LegalizeResult
3783 LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
3784                                            LLT NarrowTy) {
3785   Register DstReg = MI.getOperand(0).getReg();
3786   Register CondReg = MI.getOperand(1).getReg();
3787 
3788   unsigned NumParts = 0;
3789   LLT NarrowTy0, NarrowTy1;
3790 
3791   LLT DstTy = MRI.getType(DstReg);
3792   LLT CondTy = MRI.getType(CondReg);
3793   unsigned Size = DstTy.getSizeInBits();
3794 
3795   assert(TypeIdx == 0 || CondTy.isVector());
3796 
3797   if (TypeIdx == 0) {
3798     NarrowTy0 = NarrowTy;
3799     NarrowTy1 = CondTy;
3800 
3801     unsigned NarrowSize = NarrowTy0.getSizeInBits();
3802     // FIXME: Don't know how to handle the situation where the small vectors
3803     // aren't all the same size yet.
3804     if (Size % NarrowSize != 0)
3805       return UnableToLegalize;
3806 
3807     NumParts = Size / NarrowSize;
3808 
3809     // Need to break down the condition type
3810     if (CondTy.isVector()) {
3811       if (CondTy.getNumElements() == NumParts)
3812         NarrowTy1 = CondTy.getElementType();
3813       else
3814         NarrowTy1 =
3815             LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
3816                         CondTy.getScalarSizeInBits());
3817     }
3818   } else {
3819     NumParts = CondTy.getNumElements();
3820     if (NarrowTy.isVector()) {
3821       // TODO: Handle uneven breakdown.
3822       if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements())
3823         return UnableToLegalize;
3824 
3825       return UnableToLegalize;
3826     } else {
3827       NarrowTy0 = DstTy.getElementType();
3828       NarrowTy1 = NarrowTy;
3829     }
3830   }
3831 
3832   SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
3833   if (CondTy.isVector())
3834     extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);
3835 
3836   extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
3837   extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);
3838 
3839   for (unsigned i = 0; i < NumParts; ++i) {
3840     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3841     MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
3842                            Src1Regs[i], Src2Regs[i]);
3843     DstRegs.push_back(DstReg);
3844   }
3845 
3846   if (NarrowTy0.isVector())
3847     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3848   else
3849     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3850 
3851   MI.eraseFromParent();
3852   return Legalized;
3853 }
3854 
3855 LegalizerHelper::LegalizeResult
3856 LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
3857                                         LLT NarrowTy) {
3858   const Register DstReg = MI.getOperand(0).getReg();
3859   LLT PhiTy = MRI.getType(DstReg);
3860   LLT LeftoverTy;
3861 
3862   // All of the operands need to have the same number of elements, so if we can
3863   // determine a type breakdown for the result type, we can for all of the
3864   // source types.
3865   int NumParts, NumLeftover;
3866   std::tie(NumParts, NumLeftover)
3867     = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
3868   if (NumParts < 0)
3869     return UnableToLegalize;
3870 
3871   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3872   SmallVector<MachineInstrBuilder, 4> NewInsts;
3873 
3874   const int TotalNumParts = NumParts + NumLeftover;
3875 
3876   // Insert the new phis in the result block first.
3877   for (int I = 0; I != TotalNumParts; ++I) {
3878     LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
3879     Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
3880     NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
3881                        .addDef(PartDstReg));
3882     if (I < NumParts)
3883       DstRegs.push_back(PartDstReg);
3884     else
3885       LeftoverDstRegs.push_back(PartDstReg);
3886   }
3887 
3888   MachineBasicBlock *MBB = MI.getParent();
3889   MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
3890   insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);
3891 
3892   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3893 
3894   // Insert code to extract the incoming values in each predecessor block.
3895   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3896     PartRegs.clear();
3897     LeftoverRegs.clear();
3898 
3899     Register SrcReg = MI.getOperand(I).getReg();
3900     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3901     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3902 
3903     LLT Unused;
3904     if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
3905                       LeftoverRegs))
3906       return UnableToLegalize;
3907 
3908     // Add the newly created operand splits to the existing instructions. The
3909     // odd-sized pieces are ordered after the requested NarrowTyArg sized
3910     // pieces.
3911     for (int J = 0; J != TotalNumParts; ++J) {
3912       MachineInstrBuilder MIB = NewInsts[J];
3913       MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
3914       MIB.addMBB(&OpMBB);
3915     }
3916   }
3917 
3918   MI.eraseFromParent();
3919   return Legalized;
3920 }
3921 
3922 LegalizerHelper::LegalizeResult
3923 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3924                                                   unsigned TypeIdx,
3925                                                   LLT NarrowTy) {
3926   if (TypeIdx != 1)
3927     return UnableToLegalize;
3928 
3929   const int NumDst = MI.getNumOperands() - 1;
3930   const Register SrcReg = MI.getOperand(NumDst).getReg();
3931   LLT SrcTy = MRI.getType(SrcReg);
3932 
3933   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3934 
3935   // TODO: Create sequence of extracts.
3936   if (DstTy == NarrowTy)
3937     return UnableToLegalize;
3938 
3939   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3940   if (DstTy == GCDTy) {
3941     // This would just be a copy of the same unmerge.
3942     // TODO: Create extracts, pad with undef and create intermediate merges.
3943     return UnableToLegalize;
3944   }
3945 
3946   auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
3947   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3948   const int PartsPerUnmerge = NumDst / NumUnmerge;
3949 
3950   for (int I = 0; I != NumUnmerge; ++I) {
3951     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3952 
3953     for (int J = 0; J != PartsPerUnmerge; ++J)
3954       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3955     MIB.addUse(Unmerge.getReg(I));
3956   }
3957 
3958   MI.eraseFromParent();
3959   return Legalized;
3960 }
3961 
3962 LegalizerHelper::LegalizeResult
3963 LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
3964                                          LLT NarrowTy) {
3965   Register Result = MI.getOperand(0).getReg();
3966   Register Overflow = MI.getOperand(1).getReg();
3967   Register LHS = MI.getOperand(2).getReg();
3968   Register RHS = MI.getOperand(3).getReg();
3969 
3970   LLT SrcTy = MRI.getType(LHS);
3971   if (!SrcTy.isVector())
3972     return UnableToLegalize;
3973 
3974   LLT ElementType = SrcTy.getElementType();
3975   LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
3976   const ElementCount NumResult = SrcTy.getElementCount();
3977   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3978 
3979   // Unmerge the operands to smaller parts of GCD type.
3980   auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
3981   auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);
3982 
3983   const int NumOps = UnmergeLHS->getNumOperands() - 1;
3984   const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
3985   LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
3986   LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);
3987 
3988   // Perform the operation over unmerged parts.
3989   SmallVector<Register, 8> ResultParts;
3990   SmallVector<Register, 8> OverflowParts;
3991   for (int I = 0; I != NumOps; ++I) {
3992     Register Operand1 = UnmergeLHS->getOperand(I).getReg();
3993     Register Operand2 = UnmergeRHS->getOperand(I).getReg();
3994     auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
3995                                          {Operand1, Operand2});
3996     ResultParts.push_back(PartMul->getOperand(0).getReg());
3997     OverflowParts.push_back(PartMul->getOperand(1).getReg());
3998   }
3999 
4000   LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
4001   LLT OverflowLCMTy =
4002       LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);
4003 
4004   // Recombine the pieces to the original result and overflow registers.
4005   buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
4006   buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
4007   MI.eraseFromParent();
4008   return Legalized;
4009 }
4010 
4011 // Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces
4012 // a vector
4013 //
4014 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
4015 // undef as necessary.
4016 //
4017 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
4018 //   -> <2 x s16>
4019 //
4020 // %4:_(s16) = G_IMPLICIT_DEF
4021 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
4022 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
4023 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
4024 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
4025 // %3:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %8
4026 LegalizerHelper::LegalizeResult
4027 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4028                                           LLT NarrowTy) {
4029   Register DstReg = MI.getOperand(0).getReg();
4030   LLT DstTy = MRI.getType(DstReg);
4031   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
4032   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
4033 
4034   // Break into a common type
4035   SmallVector<Register, 16> Parts;
4036   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
4037     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
4038 
4039   // Build the requested new merge, padding with undef.
4040   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
4041                                   TargetOpcode::G_ANYEXT);
4042 
4043   // Pack into the original result register.
4044   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4045 
4046   MI.eraseFromParent();
4047   return Legalized;
4048 }
4049 
4050 LegalizerHelper::LegalizeResult
4051 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4052                                                            unsigned TypeIdx,
4053                                                            LLT NarrowVecTy) {
4054   Register DstReg = MI.getOperand(0).getReg();
4055   Register SrcVec = MI.getOperand(1).getReg();
4056   Register InsertVal;
4057   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4058 
4059   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4060   if (IsInsert)
4061     InsertVal = MI.getOperand(2).getReg();
4062 
4063   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4064 
4065   // TODO: Handle total scalarization case.
4066   if (!NarrowVecTy.isVector())
4067     return UnableToLegalize;
4068 
4069   LLT VecTy = MRI.getType(SrcVec);
4070 
4071   // If the index is a constant, we can really break this down as you would
4072   // expect, and index into the target size pieces.
4073   int64_t IdxVal;
4074   auto MaybeCst =
4075       getConstantVRegValWithLookThrough(Idx, MRI, /*LookThroughInstrs*/ true,
4076                                         /*HandleFConstants*/ false);
4077   if (MaybeCst) {
4078     IdxVal = MaybeCst->Value.getSExtValue();
4079     // Avoid out of bounds indexing the pieces.
4080     if (IdxVal >= VecTy.getNumElements()) {
4081       MIRBuilder.buildUndef(DstReg);
4082       MI.eraseFromParent();
4083       return Legalized;
4084     }
4085 
4086     SmallVector<Register, 8> VecParts;
4087     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4088 
4089     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4090     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4091                                     TargetOpcode::G_ANYEXT);
4092 
4093     unsigned NewNumElts = NarrowVecTy.getNumElements();
4094 
4095     LLT IdxTy = MRI.getType(Idx);
4096     int64_t PartIdx = IdxVal / NewNumElts;
4097     auto NewIdx =
4098         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4099 
4100     if (IsInsert) {
4101       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4102 
4103       // Use the adjusted index to insert into one of the subvectors.
4104       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4105           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4106       VecParts[PartIdx] = InsertPart.getReg(0);
4107 
4108       // Recombine the inserted subvector with the others to reform the result
4109       // vector.
4110       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4111     } else {
4112       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4113     }
4114 
4115     MI.eraseFromParent();
4116     return Legalized;
4117   }
4118 
4119   // With a variable index, we can't perform the operation in a smaller type, so
4120   // we're forced to expand this.
4121   //
4122   // TODO: We could emit a chain of compare/select to figure out which piece to
4123   // index.
4124   return lowerExtractInsertVectorElt(MI);
4125 }
4126 
4127 LegalizerHelper::LegalizeResult
4128 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4129                                       LLT NarrowTy) {
4130   // FIXME: Don't know how to handle secondary types yet.
4131   if (TypeIdx != 0)
4132     return UnableToLegalize;
4133 
4134   // This implementation doesn't work for atomics. Give up instead of doing
4135   // something invalid.
4136   if (LdStMI.isAtomic())
4137     return UnableToLegalize;
4138 
4139   bool IsLoad = isa<GLoad>(LdStMI);
4140   Register ValReg = LdStMI.getReg(0);
4141   Register AddrReg = LdStMI.getPointerReg();
4142   LLT ValTy = MRI.getType(ValReg);
4143 
4144   // FIXME: Do we need a distinct NarrowMemory legalize action?
4145   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4146     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4147     return UnableToLegalize;
4148   }
4149 
4150   int NumParts = -1;
4151   int NumLeftover = -1;
4152   LLT LeftoverTy;
4153   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4154   if (IsLoad) {
4155     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4156   } else {
4157     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4158                      NarrowLeftoverRegs)) {
4159       NumParts = NarrowRegs.size();
4160       NumLeftover = NarrowLeftoverRegs.size();
4161     }
4162   }
4163 
4164   if (NumParts == -1)
4165     return UnableToLegalize;
4166 
4167   LLT PtrTy = MRI.getType(AddrReg);
4168   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4169 
4170   unsigned TotalSize = ValTy.getSizeInBits();
4171 
4172   // Split the load/store into PartTy sized pieces starting at Offset. If this
4173   // is a load, return the new registers in ValRegs. For a store, each elements
4174   // of ValRegs should be PartTy. Returns the next offset that needs to be
4175   // handled.
4176   auto MMO = LdStMI.getMMO();
4177   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4178                              unsigned Offset) -> unsigned {
4179     MachineFunction &MF = MIRBuilder.getMF();
4180     unsigned PartSize = PartTy.getSizeInBits();
4181     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4182          Offset += PartSize, ++Idx) {
4183       unsigned ByteOffset = Offset / 8;
4184       Register NewAddrReg;
4185 
4186       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4187 
4188       MachineMemOperand *NewMMO =
4189           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4190 
4191       if (IsLoad) {
4192         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4193         ValRegs.push_back(Dst);
4194         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4195       } else {
4196         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4197       }
4198     }
4199 
4200     return Offset;
4201   };
4202 
4203   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4204 
4205   // Handle the rest of the register if this isn't an even type breakdown.
4206   if (LeftoverTy.isValid())
4207     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4208 
4209   if (IsLoad) {
4210     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4211                 LeftoverTy, NarrowLeftoverRegs);
4212   }
4213 
4214   LdStMI.eraseFromParent();
4215   return Legalized;
4216 }
4217 
4218 LegalizerHelper::LegalizeResult
4219 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
4220                                       LLT NarrowTy) {
4221   assert(TypeIdx == 0 && "only one type index expected");
4222 
4223   const unsigned Opc = MI.getOpcode();
4224   const int NumDefOps = MI.getNumExplicitDefs();
4225   const int NumSrcOps = MI.getNumOperands() - NumDefOps;
4226   const unsigned Flags = MI.getFlags();
4227   const unsigned NarrowSize = NarrowTy.getSizeInBits();
4228   const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
4229 
4230   assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
4231                                      "result and 1-3 sources or 2 results and "
4232                                      "1-2 sources");
4233 
4234   SmallVector<Register, 2> DstRegs;
4235   for (int I = 0; I < NumDefOps; ++I)
4236     DstRegs.push_back(MI.getOperand(I).getReg());
4237 
4238   // First of all check whether we are narrowing (changing the element type)
4239   // or reducing the vector elements
4240   const LLT DstTy = MRI.getType(DstRegs[0]);
4241   const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
4242 
4243   SmallVector<Register, 8> ExtractedRegs[3];
4244   SmallVector<Register, 8> Parts;
4245 
4246   // Break down all the sources into NarrowTy pieces we can operate on. This may
4247   // involve creating merges to a wider type, padded with undef.
4248   for (int I = 0; I != NumSrcOps; ++I) {
4249     Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
4250     LLT SrcTy = MRI.getType(SrcReg);
4251 
4252     // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
4253     // For fewerElements, this is a smaller vector with the same element type.
4254     LLT OpNarrowTy;
4255     if (IsNarrow) {
4256       OpNarrowTy = NarrowScalarTy;
4257 
4258       // In case of narrowing, we need to cast vectors to scalars for this to
4259       // work properly
4260       // FIXME: Can we do without the bitcast here if we're narrowing?
4261       if (SrcTy.isVector()) {
4262         SrcTy = LLT::scalar(SrcTy.getSizeInBits());
4263         SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
4264       }
4265     } else {
4266       auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
4267                                           : ElementCount::getFixed(1);
4268       OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
4269     }
4270 
4271     LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
4272 
4273     // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
4274     buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
4275                         TargetOpcode::G_ANYEXT);
4276   }
4277 
4278   SmallVector<Register, 8> ResultRegs[2];
4279 
4280   // Input operands for each sub-instruction.
4281   SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());
4282 
4283   int NumParts = ExtractedRegs[0].size();
4284   const unsigned DstSize = DstTy.getSizeInBits();
4285   const LLT DstScalarTy = LLT::scalar(DstSize);
4286 
4287   // Narrowing needs to use scalar types
4288   LLT DstLCMTy, NarrowDstTy;
4289   if (IsNarrow) {
4290     DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
4291     NarrowDstTy = NarrowScalarTy;
4292   } else {
4293     DstLCMTy = getLCMType(DstTy, NarrowTy);
4294     NarrowDstTy = NarrowTy;
4295   }
4296 
4297   // We widened the source registers to satisfy merge/unmerge size
4298   // constraints. We'll have some extra fully undef parts.
4299   const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
4300 
4301   for (int I = 0; I != NumRealParts; ++I) {
4302     // Emit this instruction on each of the split pieces.
4303     for (int J = 0; J != NumSrcOps; ++J)
4304       InputRegs[J] = ExtractedRegs[J][I];
4305 
4306     MachineInstrBuilder Inst;
4307     if (NumDefOps == 1)
4308       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
4309     else
4310       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
4311                                    Flags);
4312 
4313     for (int J = 0; J != NumDefOps; ++J)
4314       ResultRegs[J].push_back(Inst.getReg(J));
4315   }
4316 
4317   // Fill out the widened result with undef instead of creating instructions
4318   // with undef inputs.
4319   int NumUndefParts = NumParts - NumRealParts;
4320   if (NumUndefParts != 0) {
4321     Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
4322     for (int I = 0; I != NumDefOps; ++I)
4323       ResultRegs[I].append(NumUndefParts, Undef);
4324   }
4325 
4326   // Extract the possibly padded result. Use a scratch register if we need to do
4327   // a final bitcast, otherwise use the original result register.
4328   Register MergeDstReg;
4329   for (int I = 0; I != NumDefOps; ++I) {
4330     if (IsNarrow && DstTy.isVector())
4331       MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
4332     else
4333       MergeDstReg = DstRegs[I];
4334 
4335     buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);
4336 
4337     // Recast to vector if we narrowed a vector
4338     if (IsNarrow && DstTy.isVector())
4339       MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
4340   }
4341 
4342   MI.eraseFromParent();
4343   return Legalized;
4344 }
4345 
4346 LegalizerHelper::LegalizeResult
4347 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4348                                               LLT NarrowTy) {
4349   Register DstReg = MI.getOperand(0).getReg();
4350   Register SrcReg = MI.getOperand(1).getReg();
4351   int64_t Imm = MI.getOperand(2).getImm();
4352 
4353   LLT DstTy = MRI.getType(DstReg);
4354 
4355   SmallVector<Register, 8> Parts;
4356   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4357   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4358 
4359   for (Register &R : Parts)
4360     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4361 
4362   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4363 
4364   MI.eraseFromParent();
4365   return Legalized;
4366 }
4367 
4368 LegalizerHelper::LegalizeResult
4369 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4370                                      LLT NarrowTy) {
4371   using namespace TargetOpcode;
4372 
4373   switch (MI.getOpcode()) {
4374   case G_IMPLICIT_DEF:
4375     return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
4376   case G_TRUNC:
4377   case G_AND:
4378   case G_OR:
4379   case G_XOR:
4380   case G_ADD:
4381   case G_SUB:
4382   case G_MUL:
4383   case G_PTR_ADD:
4384   case G_SMULH:
4385   case G_UMULH:
4386   case G_FADD:
4387   case G_FMUL:
4388   case G_FSUB:
4389   case G_FNEG:
4390   case G_FABS:
4391   case G_FCANONICALIZE:
4392   case G_FDIV:
4393   case G_FREM:
4394   case G_FMA:
4395   case G_FMAD:
4396   case G_FPOW:
4397   case G_FEXP:
4398   case G_FEXP2:
4399   case G_FLOG:
4400   case G_FLOG2:
4401   case G_FLOG10:
4402   case G_FNEARBYINT:
4403   case G_FCEIL:
4404   case G_FFLOOR:
4405   case G_FRINT:
4406   case G_INTRINSIC_ROUND:
4407   case G_INTRINSIC_ROUNDEVEN:
4408   case G_INTRINSIC_TRUNC:
4409   case G_FCOS:
4410   case G_FSIN:
4411   case G_FSQRT:
4412   case G_BSWAP:
4413   case G_BITREVERSE:
4414   case G_SDIV:
4415   case G_UDIV:
4416   case G_SREM:
4417   case G_UREM:
4418   case G_SDIVREM:
4419   case G_UDIVREM:
4420   case G_SMIN:
4421   case G_SMAX:
4422   case G_UMIN:
4423   case G_UMAX:
4424   case G_ABS:
4425   case G_FMINNUM:
4426   case G_FMAXNUM:
4427   case G_FMINNUM_IEEE:
4428   case G_FMAXNUM_IEEE:
4429   case G_FMINIMUM:
4430   case G_FMAXIMUM:
4431   case G_FSHL:
4432   case G_FSHR:
4433   case G_FREEZE:
4434   case G_SADDSAT:
4435   case G_SSUBSAT:
4436   case G_UADDSAT:
4437   case G_USUBSAT:
4438     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
4439   case G_UMULO:
4440   case G_SMULO:
4441     return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
4442   case G_SHL:
4443   case G_LSHR:
4444   case G_ASHR:
4445   case G_SSHLSAT:
4446   case G_USHLSAT:
4447   case G_CTLZ:
4448   case G_CTLZ_ZERO_UNDEF:
4449   case G_CTTZ:
4450   case G_CTTZ_ZERO_UNDEF:
4451   case G_CTPOP:
4452   case G_FCOPYSIGN:
4453     return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
4454   case G_ZEXT:
4455   case G_SEXT:
4456   case G_ANYEXT:
4457   case G_FPEXT:
4458   case G_FPTRUNC:
4459   case G_SITOFP:
4460   case G_UITOFP:
4461   case G_FPTOSI:
4462   case G_FPTOUI:
4463   case G_INTTOPTR:
4464   case G_PTRTOINT:
4465   case G_ADDRSPACE_CAST:
4466     return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
4467   case G_ICMP:
4468   case G_FCMP:
4469     return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
4470   case G_SELECT:
4471     return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
4472   case G_PHI:
4473     return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
4474   case G_UNMERGE_VALUES:
4475     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4476   case G_BUILD_VECTOR:
4477     assert(TypeIdx == 0 && "not a vector type index");
4478     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4479   case G_CONCAT_VECTORS:
4480     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4481       return UnableToLegalize;
4482     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4483   case G_EXTRACT_VECTOR_ELT:
4484   case G_INSERT_VECTOR_ELT:
4485     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4486   case G_LOAD:
4487   case G_STORE:
4488     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4489   case G_SEXT_INREG:
4490     return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
4491   GISEL_VECREDUCE_CASES_NONSEQ
4492     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4493   case G_SHUFFLE_VECTOR:
4494     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4495   default:
4496     return UnableToLegalize;
4497   }
4498 }
4499 
4500 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4501     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4502   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4503   if (TypeIdx != 0)
4504     return UnableToLegalize;
4505 
4506   Register DstReg = MI.getOperand(0).getReg();
4507   Register Src1Reg = MI.getOperand(1).getReg();
4508   Register Src2Reg = MI.getOperand(2).getReg();
4509   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4510   LLT DstTy = MRI.getType(DstReg);
4511   LLT Src1Ty = MRI.getType(Src1Reg);
4512   LLT Src2Ty = MRI.getType(Src2Reg);
4513   // The shuffle should be canonicalized by now.
4514   if (DstTy != Src1Ty)
4515     return UnableToLegalize;
4516   if (DstTy != Src2Ty)
4517     return UnableToLegalize;
4518 
4519   if (!isPowerOf2_32(DstTy.getNumElements()))
4520     return UnableToLegalize;
4521 
4522   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4523   // Further legalization attempts will be needed to do split further.
4524   NarrowTy =
4525       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4526   unsigned NewElts = NarrowTy.getNumElements();
4527 
4528   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4529   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4530   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4531   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4532                         SplitSrc2Regs[1]};
4533 
4534   Register Hi, Lo;
4535 
4536   // If Lo or Hi uses elements from at most two of the four input vectors, then
4537   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4538   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4539   SmallVector<int, 16> Ops;
4540   for (unsigned High = 0; High < 2; ++High) {
4541     Register &Output = High ? Hi : Lo;
4542 
4543     // Build a shuffle mask for the output, discovering on the fly which
4544     // input vectors to use as shuffle operands (recorded in InputUsed).
4545     // If building a suitable shuffle vector proves too hard, then bail
4546     // out with useBuildVector set.
4547     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4548     unsigned FirstMaskIdx = High * NewElts;
4549     bool UseBuildVector = false;
4550     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4551       // The mask element.  This indexes into the input.
4552       int Idx = Mask[FirstMaskIdx + MaskOffset];
4553 
4554       // The input vector this mask element indexes into.
4555       unsigned Input = (unsigned)Idx / NewElts;
4556 
4557       if (Input >= array_lengthof(Inputs)) {
4558         // The mask element does not index into any input vector.
4559         Ops.push_back(-1);
4560         continue;
4561       }
4562 
4563       // Turn the index into an offset from the start of the input vector.
4564       Idx -= Input * NewElts;
4565 
4566       // Find or create a shuffle vector operand to hold this input.
4567       unsigned OpNo;
4568       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4569         if (InputUsed[OpNo] == Input) {
4570           // This input vector is already an operand.
4571           break;
4572         } else if (InputUsed[OpNo] == -1U) {
4573           // Create a new operand for this input vector.
4574           InputUsed[OpNo] = Input;
4575           break;
4576         }
4577       }
4578 
4579       if (OpNo >= array_lengthof(InputUsed)) {
4580         // More than two input vectors used!  Give up on trying to create a
4581         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4582         UseBuildVector = true;
4583         break;
4584       }
4585 
4586       // Add the mask index for the new shuffle vector.
4587       Ops.push_back(Idx + OpNo * NewElts);
4588     }
4589 
4590     if (UseBuildVector) {
4591       LLT EltTy = NarrowTy.getElementType();
4592       SmallVector<Register, 16> SVOps;
4593 
4594       // Extract the input elements by hand.
4595       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4596         // The mask element.  This indexes into the input.
4597         int Idx = Mask[FirstMaskIdx + MaskOffset];
4598 
4599         // The input vector this mask element indexes into.
4600         unsigned Input = (unsigned)Idx / NewElts;
4601 
4602         if (Input >= array_lengthof(Inputs)) {
4603           // The mask element is "undef" or indexes off the end of the input.
4604           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4605           continue;
4606         }
4607 
4608         // Turn the index into an offset from the start of the input vector.
4609         Idx -= Input * NewElts;
4610 
4611         // Extract the vector element by hand.
4612         SVOps.push_back(MIRBuilder
4613                             .buildExtractVectorElement(
4614                                 EltTy, Inputs[Input],
4615                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4616                             .getReg(0));
4617       }
4618 
4619       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4620       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4621     } else if (InputUsed[0] == -1U) {
4622       // No input vectors were used! The result is undefined.
4623       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4624     } else {
4625       Register Op0 = Inputs[InputUsed[0]];
4626       // If only one input was used, use an undefined vector for the other.
4627       Register Op1 = InputUsed[1] == -1U
4628                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4629                          : Inputs[InputUsed[1]];
4630       // At least one input vector was used. Create a new shuffle vector.
4631       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4632     }
4633 
4634     Ops.clear();
4635   }
4636 
4637   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4638   MI.eraseFromParent();
4639   return Legalized;
4640 }
4641 
4642 static unsigned getScalarOpcForReduction(unsigned Opc) {
4643   unsigned ScalarOpc;
4644   switch (Opc) {
4645   case TargetOpcode::G_VECREDUCE_FADD:
4646     ScalarOpc = TargetOpcode::G_FADD;
4647     break;
4648   case TargetOpcode::G_VECREDUCE_FMUL:
4649     ScalarOpc = TargetOpcode::G_FMUL;
4650     break;
4651   case TargetOpcode::G_VECREDUCE_FMAX:
4652     ScalarOpc = TargetOpcode::G_FMAXNUM;
4653     break;
4654   case TargetOpcode::G_VECREDUCE_FMIN:
4655     ScalarOpc = TargetOpcode::G_FMINNUM;
4656     break;
4657   case TargetOpcode::G_VECREDUCE_ADD:
4658     ScalarOpc = TargetOpcode::G_ADD;
4659     break;
4660   case TargetOpcode::G_VECREDUCE_MUL:
4661     ScalarOpc = TargetOpcode::G_MUL;
4662     break;
4663   case TargetOpcode::G_VECREDUCE_AND:
4664     ScalarOpc = TargetOpcode::G_AND;
4665     break;
4666   case TargetOpcode::G_VECREDUCE_OR:
4667     ScalarOpc = TargetOpcode::G_OR;
4668     break;
4669   case TargetOpcode::G_VECREDUCE_XOR:
4670     ScalarOpc = TargetOpcode::G_XOR;
4671     break;
4672   case TargetOpcode::G_VECREDUCE_SMAX:
4673     ScalarOpc = TargetOpcode::G_SMAX;
4674     break;
4675   case TargetOpcode::G_VECREDUCE_SMIN:
4676     ScalarOpc = TargetOpcode::G_SMIN;
4677     break;
4678   case TargetOpcode::G_VECREDUCE_UMAX:
4679     ScalarOpc = TargetOpcode::G_UMAX;
4680     break;
4681   case TargetOpcode::G_VECREDUCE_UMIN:
4682     ScalarOpc = TargetOpcode::G_UMIN;
4683     break;
4684   default:
4685     llvm_unreachable("Unhandled reduction");
4686   }
4687   return ScalarOpc;
4688 }
4689 
4690 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4691     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4692   unsigned Opc = MI.getOpcode();
4693   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4694          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4695          "Sequential reductions not expected");
4696 
4697   if (TypeIdx != 1)
4698     return UnableToLegalize;
4699 
4700   // The semantics of the normal non-sequential reductions allow us to freely
4701   // re-associate the operation.
4702   Register SrcReg = MI.getOperand(1).getReg();
4703   LLT SrcTy = MRI.getType(SrcReg);
4704   Register DstReg = MI.getOperand(0).getReg();
4705   LLT DstTy = MRI.getType(DstReg);
4706 
4707   if (NarrowTy.isVector() &&
4708       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4709     return UnableToLegalize;
4710 
4711   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4712   SmallVector<Register> SplitSrcs;
4713   // If NarrowTy is a scalar then we're being asked to scalarize.
4714   const unsigned NumParts =
4715       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4716                           : SrcTy.getNumElements();
4717 
4718   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4719   if (NarrowTy.isScalar()) {
4720     if (DstTy != NarrowTy)
4721       return UnableToLegalize; // FIXME: handle implicit extensions.
4722 
4723     if (isPowerOf2_32(NumParts)) {
4724       // Generate a tree of scalar operations to reduce the critical path.
4725       SmallVector<Register> PartialResults;
4726       unsigned NumPartsLeft = NumParts;
4727       while (NumPartsLeft > 1) {
4728         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4729           PartialResults.emplace_back(
4730               MIRBuilder
4731                   .buildInstr(ScalarOpc, {NarrowTy},
4732                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4733                   .getReg(0));
4734         }
4735         SplitSrcs = PartialResults;
4736         PartialResults.clear();
4737         NumPartsLeft = SplitSrcs.size();
4738       }
4739       assert(SplitSrcs.size() == 1);
4740       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4741       MI.eraseFromParent();
4742       return Legalized;
4743     }
4744     // If we can't generate a tree, then just do sequential operations.
4745     Register Acc = SplitSrcs[0];
4746     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4747       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4748                 .getReg(0);
4749     MIRBuilder.buildCopy(DstReg, Acc);
4750     MI.eraseFromParent();
4751     return Legalized;
4752   }
4753   SmallVector<Register> PartialReductions;
4754   for (unsigned Part = 0; Part < NumParts; ++Part) {
4755     PartialReductions.push_back(
4756         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4757   }
4758 
4759 
4760   // If the types involved are powers of 2, we can generate intermediate vector
4761   // ops, before generating a final reduction operation.
4762   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4763       isPowerOf2_32(NarrowTy.getNumElements())) {
4764     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4765   }
4766 
4767   Register Acc = PartialReductions[0];
4768   for (unsigned Part = 1; Part < NumParts; ++Part) {
4769     if (Part == NumParts - 1) {
4770       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4771                             {Acc, PartialReductions[Part]});
4772     } else {
4773       Acc = MIRBuilder
4774                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4775                 .getReg(0);
4776     }
4777   }
4778   MI.eraseFromParent();
4779   return Legalized;
4780 }
4781 
4782 LegalizerHelper::LegalizeResult
4783 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4784                                         LLT SrcTy, LLT NarrowTy,
4785                                         unsigned ScalarOpc) {
4786   SmallVector<Register> SplitSrcs;
4787   // Split the sources into NarrowTy size pieces.
4788   extractParts(SrcReg, NarrowTy,
4789                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4790   // We're going to do a tree reduction using vector operations until we have
4791   // one NarrowTy size value left.
4792   while (SplitSrcs.size() > 1) {
4793     SmallVector<Register> PartialRdxs;
4794     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4795       Register LHS = SplitSrcs[Idx];
4796       Register RHS = SplitSrcs[Idx + 1];
4797       // Create the intermediate vector op.
4798       Register Res =
4799           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4800       PartialRdxs.push_back(Res);
4801     }
4802     SplitSrcs = std::move(PartialRdxs);
4803   }
4804   // Finally generate the requested NarrowTy based reduction.
4805   Observer.changingInstr(MI);
4806   MI.getOperand(1).setReg(SplitSrcs[0]);
4807   Observer.changedInstr(MI);
4808   return Legalized;
4809 }
4810 
4811 LegalizerHelper::LegalizeResult
4812 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4813                                              const LLT HalfTy, const LLT AmtTy) {
4814 
4815   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4816   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4817   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4818 
4819   if (Amt.isNullValue()) {
4820     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4821     MI.eraseFromParent();
4822     return Legalized;
4823   }
4824 
4825   LLT NVT = HalfTy;
4826   unsigned NVTBits = HalfTy.getSizeInBits();
4827   unsigned VTBits = 2 * NVTBits;
4828 
4829   SrcOp Lo(Register(0)), Hi(Register(0));
4830   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4831     if (Amt.ugt(VTBits)) {
4832       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4833     } else if (Amt.ugt(NVTBits)) {
4834       Lo = MIRBuilder.buildConstant(NVT, 0);
4835       Hi = MIRBuilder.buildShl(NVT, InL,
4836                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4837     } else if (Amt == NVTBits) {
4838       Lo = MIRBuilder.buildConstant(NVT, 0);
4839       Hi = InL;
4840     } else {
4841       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4842       auto OrLHS =
4843           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4844       auto OrRHS = MIRBuilder.buildLShr(
4845           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4846       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4847     }
4848   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4849     if (Amt.ugt(VTBits)) {
4850       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4851     } else if (Amt.ugt(NVTBits)) {
4852       Lo = MIRBuilder.buildLShr(NVT, InH,
4853                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4854       Hi = MIRBuilder.buildConstant(NVT, 0);
4855     } else if (Amt == NVTBits) {
4856       Lo = InH;
4857       Hi = MIRBuilder.buildConstant(NVT, 0);
4858     } else {
4859       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4860 
4861       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4862       auto OrRHS = MIRBuilder.buildShl(
4863           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4864 
4865       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4866       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4867     }
4868   } else {
4869     if (Amt.ugt(VTBits)) {
4870       Hi = Lo = MIRBuilder.buildAShr(
4871           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4872     } else if (Amt.ugt(NVTBits)) {
4873       Lo = MIRBuilder.buildAShr(NVT, InH,
4874                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4875       Hi = MIRBuilder.buildAShr(NVT, InH,
4876                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4877     } else if (Amt == NVTBits) {
4878       Lo = InH;
4879       Hi = MIRBuilder.buildAShr(NVT, InH,
4880                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4881     } else {
4882       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4883 
4884       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4885       auto OrRHS = MIRBuilder.buildShl(
4886           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4887 
4888       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4889       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4890     }
4891   }
4892 
4893   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4894   MI.eraseFromParent();
4895 
4896   return Legalized;
4897 }
4898 
4899 // TODO: Optimize if constant shift amount.
4900 LegalizerHelper::LegalizeResult
4901 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4902                                    LLT RequestedTy) {
4903   if (TypeIdx == 1) {
4904     Observer.changingInstr(MI);
4905     narrowScalarSrc(MI, RequestedTy, 2);
4906     Observer.changedInstr(MI);
4907     return Legalized;
4908   }
4909 
4910   Register DstReg = MI.getOperand(0).getReg();
4911   LLT DstTy = MRI.getType(DstReg);
4912   if (DstTy.isVector())
4913     return UnableToLegalize;
4914 
4915   Register Amt = MI.getOperand(2).getReg();
4916   LLT ShiftAmtTy = MRI.getType(Amt);
4917   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4918   if (DstEltSize % 2 != 0)
4919     return UnableToLegalize;
4920 
4921   // Ignore the input type. We can only go to exactly half the size of the
4922   // input. If that isn't small enough, the resulting pieces will be further
4923   // legalized.
4924   const unsigned NewBitSize = DstEltSize / 2;
4925   const LLT HalfTy = LLT::scalar(NewBitSize);
4926   const LLT CondTy = LLT::scalar(1);
4927 
4928   if (auto VRegAndVal =
4929           getConstantVRegValWithLookThrough(Amt, MRI, true, false)) {
4930     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4931                                        ShiftAmtTy);
4932   }
4933 
4934   // TODO: Expand with known bits.
4935 
4936   // Handle the fully general expansion by an unknown amount.
4937   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4938 
4939   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4940   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4941   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4942 
4943   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4944   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4945 
4946   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4947   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4948   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4949 
4950   Register ResultRegs[2];
4951   switch (MI.getOpcode()) {
4952   case TargetOpcode::G_SHL: {
4953     // Short: ShAmt < NewBitSize
4954     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4955 
4956     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4957     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4958     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4959 
4960     // Long: ShAmt >= NewBitSize
4961     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4962     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4963 
4964     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4965     auto Hi = MIRBuilder.buildSelect(
4966         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4967 
4968     ResultRegs[0] = Lo.getReg(0);
4969     ResultRegs[1] = Hi.getReg(0);
4970     break;
4971   }
4972   case TargetOpcode::G_LSHR:
4973   case TargetOpcode::G_ASHR: {
4974     // Short: ShAmt < NewBitSize
4975     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4976 
4977     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4978     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4979     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4980 
4981     // Long: ShAmt >= NewBitSize
4982     MachineInstrBuilder HiL;
4983     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4984       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4985     } else {
4986       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4987       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4988     }
4989     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4990                                      {InH, AmtExcess});     // Lo from Hi part.
4991 
4992     auto Lo = MIRBuilder.buildSelect(
4993         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4994 
4995     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4996 
4997     ResultRegs[0] = Lo.getReg(0);
4998     ResultRegs[1] = Hi.getReg(0);
4999     break;
5000   }
5001   default:
5002     llvm_unreachable("not a shift");
5003   }
5004 
5005   MIRBuilder.buildMerge(DstReg, ResultRegs);
5006   MI.eraseFromParent();
5007   return Legalized;
5008 }
5009 
5010 LegalizerHelper::LegalizeResult
5011 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5012                                        LLT MoreTy) {
5013   assert(TypeIdx == 0 && "Expecting only Idx 0");
5014 
5015   Observer.changingInstr(MI);
5016   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5017     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5018     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5019     moreElementsVectorSrc(MI, MoreTy, I);
5020   }
5021 
5022   MachineBasicBlock &MBB = *MI.getParent();
5023   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5024   moreElementsVectorDst(MI, MoreTy, 0);
5025   Observer.changedInstr(MI);
5026   return Legalized;
5027 }
5028 
5029 LegalizerHelper::LegalizeResult
5030 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5031                                     LLT MoreTy) {
5032   unsigned Opc = MI.getOpcode();
5033   switch (Opc) {
5034   case TargetOpcode::G_IMPLICIT_DEF:
5035   case TargetOpcode::G_LOAD: {
5036     if (TypeIdx != 0)
5037       return UnableToLegalize;
5038     Observer.changingInstr(MI);
5039     moreElementsVectorDst(MI, MoreTy, 0);
5040     Observer.changedInstr(MI);
5041     return Legalized;
5042   }
5043   case TargetOpcode::G_STORE:
5044     if (TypeIdx != 0)
5045       return UnableToLegalize;
5046     Observer.changingInstr(MI);
5047     moreElementsVectorSrc(MI, MoreTy, 0);
5048     Observer.changedInstr(MI);
5049     return Legalized;
5050   case TargetOpcode::G_AND:
5051   case TargetOpcode::G_OR:
5052   case TargetOpcode::G_XOR:
5053   case TargetOpcode::G_SMIN:
5054   case TargetOpcode::G_SMAX:
5055   case TargetOpcode::G_UMIN:
5056   case TargetOpcode::G_UMAX:
5057   case TargetOpcode::G_FMINNUM:
5058   case TargetOpcode::G_FMAXNUM:
5059   case TargetOpcode::G_FMINNUM_IEEE:
5060   case TargetOpcode::G_FMAXNUM_IEEE:
5061   case TargetOpcode::G_FMINIMUM:
5062   case TargetOpcode::G_FMAXIMUM: {
5063     Observer.changingInstr(MI);
5064     moreElementsVectorSrc(MI, MoreTy, 1);
5065     moreElementsVectorSrc(MI, MoreTy, 2);
5066     moreElementsVectorDst(MI, MoreTy, 0);
5067     Observer.changedInstr(MI);
5068     return Legalized;
5069   }
5070   case TargetOpcode::G_EXTRACT:
5071     if (TypeIdx != 1)
5072       return UnableToLegalize;
5073     Observer.changingInstr(MI);
5074     moreElementsVectorSrc(MI, MoreTy, 1);
5075     Observer.changedInstr(MI);
5076     return Legalized;
5077   case TargetOpcode::G_INSERT:
5078   case TargetOpcode::G_FREEZE:
5079     if (TypeIdx != 0)
5080       return UnableToLegalize;
5081     Observer.changingInstr(MI);
5082     moreElementsVectorSrc(MI, MoreTy, 1);
5083     moreElementsVectorDst(MI, MoreTy, 0);
5084     Observer.changedInstr(MI);
5085     return Legalized;
5086   case TargetOpcode::G_SELECT:
5087     if (TypeIdx != 0)
5088       return UnableToLegalize;
5089     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
5090       return UnableToLegalize;
5091 
5092     Observer.changingInstr(MI);
5093     moreElementsVectorSrc(MI, MoreTy, 2);
5094     moreElementsVectorSrc(MI, MoreTy, 3);
5095     moreElementsVectorDst(MI, MoreTy, 0);
5096     Observer.changedInstr(MI);
5097     return Legalized;
5098   case TargetOpcode::G_UNMERGE_VALUES: {
5099     if (TypeIdx != 1)
5100       return UnableToLegalize;
5101 
5102     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
5103     int NumDst = MI.getNumOperands() - 1;
5104     moreElementsVectorSrc(MI, MoreTy, NumDst);
5105 
5106     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5107     for (int I = 0; I != NumDst; ++I)
5108       MIB.addDef(MI.getOperand(I).getReg());
5109 
5110     int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
5111     for (int I = NumDst; I != NewNumDst; ++I)
5112       MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
5113 
5114     MIB.addUse(MI.getOperand(NumDst).getReg());
5115     MI.eraseFromParent();
5116     return Legalized;
5117   }
5118   case TargetOpcode::G_PHI:
5119     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5120   case TargetOpcode::G_SHUFFLE_VECTOR:
5121     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5122   default:
5123     return UnableToLegalize;
5124   }
5125 }
5126 
5127 LegalizerHelper::LegalizeResult
5128 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5129                                            unsigned int TypeIdx, LLT MoreTy) {
5130   if (TypeIdx != 0)
5131     return UnableToLegalize;
5132 
5133   Register DstReg = MI.getOperand(0).getReg();
5134   Register Src1Reg = MI.getOperand(1).getReg();
5135   Register Src2Reg = MI.getOperand(2).getReg();
5136   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5137   LLT DstTy = MRI.getType(DstReg);
5138   LLT Src1Ty = MRI.getType(Src1Reg);
5139   LLT Src2Ty = MRI.getType(Src2Reg);
5140   unsigned NumElts = DstTy.getNumElements();
5141   unsigned WidenNumElts = MoreTy.getNumElements();
5142 
5143   // Expect a canonicalized shuffle.
5144   if (DstTy != Src1Ty || DstTy != Src2Ty)
5145     return UnableToLegalize;
5146 
5147   moreElementsVectorSrc(MI, MoreTy, 1);
5148   moreElementsVectorSrc(MI, MoreTy, 2);
5149 
5150   // Adjust mask based on new input vector length.
5151   SmallVector<int, 16> NewMask;
5152   for (unsigned I = 0; I != NumElts; ++I) {
5153     int Idx = Mask[I];
5154     if (Idx < static_cast<int>(NumElts))
5155       NewMask.push_back(Idx);
5156     else
5157       NewMask.push_back(Idx - NumElts + WidenNumElts);
5158   }
5159   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5160     NewMask.push_back(-1);
5161   moreElementsVectorDst(MI, MoreTy, 0);
5162   MIRBuilder.setInstrAndDebugLoc(MI);
5163   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5164                                 MI.getOperand(1).getReg(),
5165                                 MI.getOperand(2).getReg(), NewMask);
5166   MI.eraseFromParent();
5167   return Legalized;
5168 }
5169 
5170 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5171                                         ArrayRef<Register> Src1Regs,
5172                                         ArrayRef<Register> Src2Regs,
5173                                         LLT NarrowTy) {
5174   MachineIRBuilder &B = MIRBuilder;
5175   unsigned SrcParts = Src1Regs.size();
5176   unsigned DstParts = DstRegs.size();
5177 
5178   unsigned DstIdx = 0; // Low bits of the result.
5179   Register FactorSum =
5180       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5181   DstRegs[DstIdx] = FactorSum;
5182 
5183   unsigned CarrySumPrevDstIdx;
5184   SmallVector<Register, 4> Factors;
5185 
5186   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5187     // Collect low parts of muls for DstIdx.
5188     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5189          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5190       MachineInstrBuilder Mul =
5191           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5192       Factors.push_back(Mul.getReg(0));
5193     }
5194     // Collect high parts of muls from previous DstIdx.
5195     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5196          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5197       MachineInstrBuilder Umulh =
5198           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5199       Factors.push_back(Umulh.getReg(0));
5200     }
5201     // Add CarrySum from additions calculated for previous DstIdx.
5202     if (DstIdx != 1) {
5203       Factors.push_back(CarrySumPrevDstIdx);
5204     }
5205 
5206     Register CarrySum;
5207     // Add all factors and accumulate all carries into CarrySum.
5208     if (DstIdx != DstParts - 1) {
5209       MachineInstrBuilder Uaddo =
5210           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5211       FactorSum = Uaddo.getReg(0);
5212       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5213       for (unsigned i = 2; i < Factors.size(); ++i) {
5214         MachineInstrBuilder Uaddo =
5215             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5216         FactorSum = Uaddo.getReg(0);
5217         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5218         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5219       }
5220     } else {
5221       // Since value for the next index is not calculated, neither is CarrySum.
5222       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5223       for (unsigned i = 2; i < Factors.size(); ++i)
5224         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5225     }
5226 
5227     CarrySumPrevDstIdx = CarrySum;
5228     DstRegs[DstIdx] = FactorSum;
5229     Factors.clear();
5230   }
5231 }
5232 
5233 LegalizerHelper::LegalizeResult
5234 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5235                                     LLT NarrowTy) {
5236   if (TypeIdx != 0)
5237     return UnableToLegalize;
5238 
5239   Register DstReg = MI.getOperand(0).getReg();
5240   LLT DstType = MRI.getType(DstReg);
5241   // FIXME: add support for vector types
5242   if (DstType.isVector())
5243     return UnableToLegalize;
5244 
5245   unsigned Opcode = MI.getOpcode();
5246   unsigned OpO, OpE, OpF;
5247   switch (Opcode) {
5248   case TargetOpcode::G_SADDO:
5249   case TargetOpcode::G_SADDE:
5250   case TargetOpcode::G_UADDO:
5251   case TargetOpcode::G_UADDE:
5252   case TargetOpcode::G_ADD:
5253     OpO = TargetOpcode::G_UADDO;
5254     OpE = TargetOpcode::G_UADDE;
5255     OpF = TargetOpcode::G_UADDE;
5256     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5257       OpF = TargetOpcode::G_SADDE;
5258     break;
5259   case TargetOpcode::G_SSUBO:
5260   case TargetOpcode::G_SSUBE:
5261   case TargetOpcode::G_USUBO:
5262   case TargetOpcode::G_USUBE:
5263   case TargetOpcode::G_SUB:
5264     OpO = TargetOpcode::G_USUBO;
5265     OpE = TargetOpcode::G_USUBE;
5266     OpF = TargetOpcode::G_USUBE;
5267     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5268       OpF = TargetOpcode::G_SSUBE;
5269     break;
5270   default:
5271     llvm_unreachable("Unexpected add/sub opcode!");
5272   }
5273 
5274   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5275   unsigned NumDefs = MI.getNumExplicitDefs();
5276   Register Src1 = MI.getOperand(NumDefs).getReg();
5277   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5278   Register CarryDst, CarryIn;
5279   if (NumDefs == 2)
5280     CarryDst = MI.getOperand(1).getReg();
5281   if (MI.getNumOperands() == NumDefs + 3)
5282     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5283 
5284   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5285   LLT LeftoverTy, DummyTy;
5286   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5287   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5288   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5289 
5290   int NarrowParts = Src1Regs.size();
5291   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5292     Src1Regs.push_back(Src1Left[I]);
5293     Src2Regs.push_back(Src2Left[I]);
5294   }
5295   DstRegs.reserve(Src1Regs.size());
5296 
5297   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5298     Register DstReg =
5299         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5300     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5301     // Forward the final carry-out to the destination register
5302     if (i == e - 1 && CarryDst)
5303       CarryOut = CarryDst;
5304 
5305     if (!CarryIn) {
5306       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5307                             {Src1Regs[i], Src2Regs[i]});
5308     } else if (i == e - 1) {
5309       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5310                             {Src1Regs[i], Src2Regs[i], CarryIn});
5311     } else {
5312       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5313                             {Src1Regs[i], Src2Regs[i], CarryIn});
5314     }
5315 
5316     DstRegs.push_back(DstReg);
5317     CarryIn = CarryOut;
5318   }
5319   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5320               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5321               makeArrayRef(DstRegs).drop_front(NarrowParts));
5322 
5323   MI.eraseFromParent();
5324   return Legalized;
5325 }
5326 
5327 LegalizerHelper::LegalizeResult
5328 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5329   Register DstReg = MI.getOperand(0).getReg();
5330   Register Src1 = MI.getOperand(1).getReg();
5331   Register Src2 = MI.getOperand(2).getReg();
5332 
5333   LLT Ty = MRI.getType(DstReg);
5334   if (Ty.isVector())
5335     return UnableToLegalize;
5336 
5337   unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
5338   unsigned DstSize = Ty.getSizeInBits();
5339   unsigned NarrowSize = NarrowTy.getSizeInBits();
5340   if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
5341     return UnableToLegalize;
5342 
5343   unsigned NumDstParts = DstSize / NarrowSize;
5344   unsigned NumSrcParts = SrcSize / NarrowSize;
5345   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5346   unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
5347 
5348   SmallVector<Register, 2> Src1Parts, Src2Parts;
5349   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5350   extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
5351   extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
5352   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5353 
5354   // Take only high half of registers if this is high mul.
5355   ArrayRef<Register> DstRegs(
5356       IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
5357   MIRBuilder.buildMerge(DstReg, DstRegs);
5358   MI.eraseFromParent();
5359   return Legalized;
5360 }
5361 
5362 LegalizerHelper::LegalizeResult
5363 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5364                                    LLT NarrowTy) {
5365   if (TypeIdx != 0)
5366     return UnableToLegalize;
5367 
5368   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5369 
5370   Register Src = MI.getOperand(1).getReg();
5371   LLT SrcTy = MRI.getType(Src);
5372 
5373   // If all finite floats fit into the narrowed integer type, we can just swap
5374   // out the result type. This is practically only useful for conversions from
5375   // half to at least 16-bits, so just handle the one case.
5376   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5377       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5378     return UnableToLegalize;
5379 
5380   Observer.changingInstr(MI);
5381   narrowScalarDst(MI, NarrowTy, 0,
5382                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5383   Observer.changedInstr(MI);
5384   return Legalized;
5385 }
5386 
5387 LegalizerHelper::LegalizeResult
5388 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5389                                      LLT NarrowTy) {
5390   if (TypeIdx != 1)
5391     return UnableToLegalize;
5392 
5393   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5394 
5395   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5396   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5397   // NarrowSize.
5398   if (SizeOp1 % NarrowSize != 0)
5399     return UnableToLegalize;
5400   int NumParts = SizeOp1 / NarrowSize;
5401 
5402   SmallVector<Register, 2> SrcRegs, DstRegs;
5403   SmallVector<uint64_t, 2> Indexes;
5404   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5405 
5406   Register OpReg = MI.getOperand(0).getReg();
5407   uint64_t OpStart = MI.getOperand(2).getImm();
5408   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5409   for (int i = 0; i < NumParts; ++i) {
5410     unsigned SrcStart = i * NarrowSize;
5411 
5412     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5413       // No part of the extract uses this subregister, ignore it.
5414       continue;
5415     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5416       // The entire subregister is extracted, forward the value.
5417       DstRegs.push_back(SrcRegs[i]);
5418       continue;
5419     }
5420 
5421     // OpSegStart is where this destination segment would start in OpReg if it
5422     // extended infinitely in both directions.
5423     int64_t ExtractOffset;
5424     uint64_t SegSize;
5425     if (OpStart < SrcStart) {
5426       ExtractOffset = 0;
5427       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5428     } else {
5429       ExtractOffset = OpStart - SrcStart;
5430       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5431     }
5432 
5433     Register SegReg = SrcRegs[i];
5434     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5435       // A genuine extract is needed.
5436       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5437       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5438     }
5439 
5440     DstRegs.push_back(SegReg);
5441   }
5442 
5443   Register DstReg = MI.getOperand(0).getReg();
5444   if (MRI.getType(DstReg).isVector())
5445     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5446   else if (DstRegs.size() > 1)
5447     MIRBuilder.buildMerge(DstReg, DstRegs);
5448   else
5449     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5450   MI.eraseFromParent();
5451   return Legalized;
5452 }
5453 
5454 LegalizerHelper::LegalizeResult
5455 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5456                                     LLT NarrowTy) {
5457   // FIXME: Don't know how to handle secondary types yet.
5458   if (TypeIdx != 0)
5459     return UnableToLegalize;
5460 
5461   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5462   SmallVector<uint64_t, 2> Indexes;
5463   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5464   LLT LeftoverTy;
5465   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5466                LeftoverRegs);
5467 
5468   for (Register Reg : LeftoverRegs)
5469     SrcRegs.push_back(Reg);
5470 
5471   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5472   Register OpReg = MI.getOperand(2).getReg();
5473   uint64_t OpStart = MI.getOperand(3).getImm();
5474   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5475   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5476     unsigned DstStart = I * NarrowSize;
5477 
5478     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5479       // The entire subregister is defined by this insert, forward the new
5480       // value.
5481       DstRegs.push_back(OpReg);
5482       continue;
5483     }
5484 
5485     Register SrcReg = SrcRegs[I];
5486     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5487       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5488       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5489       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5490     }
5491 
5492     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5493       // No part of the insert affects this subregister, forward the original.
5494       DstRegs.push_back(SrcReg);
5495       continue;
5496     }
5497 
5498     // OpSegStart is where this destination segment would start in OpReg if it
5499     // extended infinitely in both directions.
5500     int64_t ExtractOffset, InsertOffset;
5501     uint64_t SegSize;
5502     if (OpStart < DstStart) {
5503       InsertOffset = 0;
5504       ExtractOffset = DstStart - OpStart;
5505       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5506     } else {
5507       InsertOffset = OpStart - DstStart;
5508       ExtractOffset = 0;
5509       SegSize =
5510         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5511     }
5512 
5513     Register SegReg = OpReg;
5514     if (ExtractOffset != 0 || SegSize != OpSize) {
5515       // A genuine extract is needed.
5516       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5517       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5518     }
5519 
5520     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5521     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5522     DstRegs.push_back(DstReg);
5523   }
5524 
5525   uint64_t WideSize = DstRegs.size() * NarrowSize;
5526   Register DstReg = MI.getOperand(0).getReg();
5527   if (WideSize > RegTy.getSizeInBits()) {
5528     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5529     MIRBuilder.buildMerge(MergeReg, DstRegs);
5530     MIRBuilder.buildTrunc(DstReg, MergeReg);
5531   } else
5532     MIRBuilder.buildMerge(DstReg, DstRegs);
5533 
5534   MI.eraseFromParent();
5535   return Legalized;
5536 }
5537 
5538 LegalizerHelper::LegalizeResult
5539 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5540                                    LLT NarrowTy) {
5541   Register DstReg = MI.getOperand(0).getReg();
5542   LLT DstTy = MRI.getType(DstReg);
5543 
5544   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5545 
5546   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5547   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5548   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5549   LLT LeftoverTy;
5550   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5551                     Src0Regs, Src0LeftoverRegs))
5552     return UnableToLegalize;
5553 
5554   LLT Unused;
5555   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5556                     Src1Regs, Src1LeftoverRegs))
5557     llvm_unreachable("inconsistent extractParts result");
5558 
5559   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5560     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5561                                         {Src0Regs[I], Src1Regs[I]});
5562     DstRegs.push_back(Inst.getReg(0));
5563   }
5564 
5565   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5566     auto Inst = MIRBuilder.buildInstr(
5567       MI.getOpcode(),
5568       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5569     DstLeftoverRegs.push_back(Inst.getReg(0));
5570   }
5571 
5572   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5573               LeftoverTy, DstLeftoverRegs);
5574 
5575   MI.eraseFromParent();
5576   return Legalized;
5577 }
5578 
5579 LegalizerHelper::LegalizeResult
5580 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5581                                  LLT NarrowTy) {
5582   if (TypeIdx != 0)
5583     return UnableToLegalize;
5584 
5585   Register DstReg = MI.getOperand(0).getReg();
5586   Register SrcReg = MI.getOperand(1).getReg();
5587 
5588   LLT DstTy = MRI.getType(DstReg);
5589   if (DstTy.isVector())
5590     return UnableToLegalize;
5591 
5592   SmallVector<Register, 8> Parts;
5593   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5594   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5595   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5596 
5597   MI.eraseFromParent();
5598   return Legalized;
5599 }
5600 
5601 LegalizerHelper::LegalizeResult
5602 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5603                                     LLT NarrowTy) {
5604   if (TypeIdx != 0)
5605     return UnableToLegalize;
5606 
5607   Register CondReg = MI.getOperand(1).getReg();
5608   LLT CondTy = MRI.getType(CondReg);
5609   if (CondTy.isVector()) // TODO: Handle vselect
5610     return UnableToLegalize;
5611 
5612   Register DstReg = MI.getOperand(0).getReg();
5613   LLT DstTy = MRI.getType(DstReg);
5614 
5615   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5616   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5617   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5618   LLT LeftoverTy;
5619   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5620                     Src1Regs, Src1LeftoverRegs))
5621     return UnableToLegalize;
5622 
5623   LLT Unused;
5624   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5625                     Src2Regs, Src2LeftoverRegs))
5626     llvm_unreachable("inconsistent extractParts result");
5627 
5628   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5629     auto Select = MIRBuilder.buildSelect(NarrowTy,
5630                                          CondReg, Src1Regs[I], Src2Regs[I]);
5631     DstRegs.push_back(Select.getReg(0));
5632   }
5633 
5634   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5635     auto Select = MIRBuilder.buildSelect(
5636       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5637     DstLeftoverRegs.push_back(Select.getReg(0));
5638   }
5639 
5640   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5641               LeftoverTy, DstLeftoverRegs);
5642 
5643   MI.eraseFromParent();
5644   return Legalized;
5645 }
5646 
5647 LegalizerHelper::LegalizeResult
5648 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5649                                   LLT NarrowTy) {
5650   if (TypeIdx != 1)
5651     return UnableToLegalize;
5652 
5653   Register DstReg = MI.getOperand(0).getReg();
5654   Register SrcReg = MI.getOperand(1).getReg();
5655   LLT DstTy = MRI.getType(DstReg);
5656   LLT SrcTy = MRI.getType(SrcReg);
5657   unsigned NarrowSize = NarrowTy.getSizeInBits();
5658 
5659   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5660     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5661 
5662     MachineIRBuilder &B = MIRBuilder;
5663     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5664     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5665     auto C_0 = B.buildConstant(NarrowTy, 0);
5666     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5667                                 UnmergeSrc.getReg(1), C_0);
5668     auto LoCTLZ = IsUndef ?
5669       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5670       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5671     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5672     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5673     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5674     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5675 
5676     MI.eraseFromParent();
5677     return Legalized;
5678   }
5679 
5680   return UnableToLegalize;
5681 }
5682 
5683 LegalizerHelper::LegalizeResult
5684 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5685                                   LLT NarrowTy) {
5686   if (TypeIdx != 1)
5687     return UnableToLegalize;
5688 
5689   Register DstReg = MI.getOperand(0).getReg();
5690   Register SrcReg = MI.getOperand(1).getReg();
5691   LLT DstTy = MRI.getType(DstReg);
5692   LLT SrcTy = MRI.getType(SrcReg);
5693   unsigned NarrowSize = NarrowTy.getSizeInBits();
5694 
5695   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5696     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5697 
5698     MachineIRBuilder &B = MIRBuilder;
5699     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5700     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5701     auto C_0 = B.buildConstant(NarrowTy, 0);
5702     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5703                                 UnmergeSrc.getReg(0), C_0);
5704     auto HiCTTZ = IsUndef ?
5705       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5706       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5707     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5708     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5709     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5710     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5711 
5712     MI.eraseFromParent();
5713     return Legalized;
5714   }
5715 
5716   return UnableToLegalize;
5717 }
5718 
5719 LegalizerHelper::LegalizeResult
5720 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5721                                    LLT NarrowTy) {
5722   if (TypeIdx != 1)
5723     return UnableToLegalize;
5724 
5725   Register DstReg = MI.getOperand(0).getReg();
5726   LLT DstTy = MRI.getType(DstReg);
5727   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5728   unsigned NarrowSize = NarrowTy.getSizeInBits();
5729 
5730   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5731     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5732 
5733     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5734     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5735     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5736 
5737     MI.eraseFromParent();
5738     return Legalized;
5739   }
5740 
5741   return UnableToLegalize;
5742 }
5743 
5744 LegalizerHelper::LegalizeResult
5745 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5746   unsigned Opc = MI.getOpcode();
5747   const auto &TII = MIRBuilder.getTII();
5748   auto isSupported = [this](const LegalityQuery &Q) {
5749     auto QAction = LI.getAction(Q).Action;
5750     return QAction == Legal || QAction == Libcall || QAction == Custom;
5751   };
5752   switch (Opc) {
5753   default:
5754     return UnableToLegalize;
5755   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5756     // This trivially expands to CTLZ.
5757     Observer.changingInstr(MI);
5758     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5759     Observer.changedInstr(MI);
5760     return Legalized;
5761   }
5762   case TargetOpcode::G_CTLZ: {
5763     Register DstReg = MI.getOperand(0).getReg();
5764     Register SrcReg = MI.getOperand(1).getReg();
5765     LLT DstTy = MRI.getType(DstReg);
5766     LLT SrcTy = MRI.getType(SrcReg);
5767     unsigned Len = SrcTy.getSizeInBits();
5768 
5769     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5770       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5771       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5772       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5773       auto ICmp = MIRBuilder.buildICmp(
5774           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5775       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5776       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5777       MI.eraseFromParent();
5778       return Legalized;
5779     }
5780     // for now, we do this:
5781     // NewLen = NextPowerOf2(Len);
5782     // x = x | (x >> 1);
5783     // x = x | (x >> 2);
5784     // ...
5785     // x = x | (x >>16);
5786     // x = x | (x >>32); // for 64-bit input
5787     // Upto NewLen/2
5788     // return Len - popcount(x);
5789     //
5790     // Ref: "Hacker's Delight" by Henry Warren
5791     Register Op = SrcReg;
5792     unsigned NewLen = PowerOf2Ceil(Len);
5793     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5794       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5795       auto MIBOp = MIRBuilder.buildOr(
5796           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5797       Op = MIBOp.getReg(0);
5798     }
5799     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5800     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5801                         MIBPop);
5802     MI.eraseFromParent();
5803     return Legalized;
5804   }
5805   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5806     // This trivially expands to CTTZ.
5807     Observer.changingInstr(MI);
5808     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5809     Observer.changedInstr(MI);
5810     return Legalized;
5811   }
5812   case TargetOpcode::G_CTTZ: {
5813     Register DstReg = MI.getOperand(0).getReg();
5814     Register SrcReg = MI.getOperand(1).getReg();
5815     LLT DstTy = MRI.getType(DstReg);
5816     LLT SrcTy = MRI.getType(SrcReg);
5817 
5818     unsigned Len = SrcTy.getSizeInBits();
5819     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5820       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5821       // zero.
5822       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5823       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5824       auto ICmp = MIRBuilder.buildICmp(
5825           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5826       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5827       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5828       MI.eraseFromParent();
5829       return Legalized;
5830     }
5831     // for now, we use: { return popcount(~x & (x - 1)); }
5832     // unless the target has ctlz but not ctpop, in which case we use:
5833     // { return 32 - nlz(~x & (x-1)); }
5834     // Ref: "Hacker's Delight" by Henry Warren
5835     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5836     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5837     auto MIBTmp = MIRBuilder.buildAnd(
5838         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5839     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5840         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5841       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5842       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5843                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5844       MI.eraseFromParent();
5845       return Legalized;
5846     }
5847     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5848     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5849     return Legalized;
5850   }
5851   case TargetOpcode::G_CTPOP: {
5852     Register SrcReg = MI.getOperand(1).getReg();
5853     LLT Ty = MRI.getType(SrcReg);
5854     unsigned Size = Ty.getSizeInBits();
5855     MachineIRBuilder &B = MIRBuilder;
5856 
5857     // Count set bits in blocks of 2 bits. Default approach would be
5858     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5859     // We use following formula instead:
5860     // B2Count = val - { (val >> 1) & 0x55555555 }
5861     // since it gives same result in blocks of 2 with one instruction less.
5862     auto C_1 = B.buildConstant(Ty, 1);
5863     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5864     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5865     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5866     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5867     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5868 
5869     // In order to get count in blocks of 4 add values from adjacent block of 2.
5870     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5871     auto C_2 = B.buildConstant(Ty, 2);
5872     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5873     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5874     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5875     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5876     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5877     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5878 
5879     // For count in blocks of 8 bits we don't have to mask high 4 bits before
5880     // addition since count value sits in range {0,...,8} and 4 bits are enough
5881     // to hold such binary values. After addition high 4 bits still hold count
5882     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
5883     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5884     auto C_4 = B.buildConstant(Ty, 4);
5885     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5886     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5887     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5888     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5889     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5890 
5891     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
5892     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
5893     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
5894     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5895     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5896 
5897     // Shift count result from 8 high bits to low bits.
5898     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5899     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5900 
5901     MI.eraseFromParent();
5902     return Legalized;
5903   }
5904   }
5905 }
5906 
5907 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5908 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5909                                         Register Reg, unsigned BW) {
5910   return matchUnaryPredicate(
5911       MRI, Reg,
5912       [=](const Constant *C) {
5913         // Null constant here means an undef.
5914         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5915         return !CI || CI->getValue().urem(BW) != 0;
5916       },
5917       /*AllowUndefs*/ true);
5918 }
5919 
5920 LegalizerHelper::LegalizeResult
5921 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5922   Register Dst = MI.getOperand(0).getReg();
5923   Register X = MI.getOperand(1).getReg();
5924   Register Y = MI.getOperand(2).getReg();
5925   Register Z = MI.getOperand(3).getReg();
5926   LLT Ty = MRI.getType(Dst);
5927   LLT ShTy = MRI.getType(Z);
5928 
5929   unsigned BW = Ty.getScalarSizeInBits();
5930 
5931   if (!isPowerOf2_32(BW))
5932     return UnableToLegalize;
5933 
5934   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5935   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5936 
5937   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5938     // fshl X, Y, Z -> fshr X, Y, -Z
5939     // fshr X, Y, Z -> fshl X, Y, -Z
5940     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5941     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5942   } else {
5943     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5944     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5945     auto One = MIRBuilder.buildConstant(ShTy, 1);
5946     if (IsFSHL) {
5947       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5948       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5949     } else {
5950       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5951       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5952     }
5953 
5954     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5955   }
5956 
5957   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5958   MI.eraseFromParent();
5959   return Legalized;
5960 }
5961 
5962 LegalizerHelper::LegalizeResult
5963 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5964   Register Dst = MI.getOperand(0).getReg();
5965   Register X = MI.getOperand(1).getReg();
5966   Register Y = MI.getOperand(2).getReg();
5967   Register Z = MI.getOperand(3).getReg();
5968   LLT Ty = MRI.getType(Dst);
5969   LLT ShTy = MRI.getType(Z);
5970 
5971   const unsigned BW = Ty.getScalarSizeInBits();
5972   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5973 
5974   Register ShX, ShY;
5975   Register ShAmt, InvShAmt;
5976 
5977   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5978   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5979     // fshl: X << C | Y >> (BW - C)
5980     // fshr: X << (BW - C) | Y >> C
5981     // where C = Z % BW is not zero
5982     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5983     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5984     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5985     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5986     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5987   } else {
5988     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5989     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5990     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5991     if (isPowerOf2_32(BW)) {
5992       // Z % BW -> Z & (BW - 1)
5993       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5994       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5995       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5996       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5997     } else {
5998       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5999       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6000       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6001     }
6002 
6003     auto One = MIRBuilder.buildConstant(ShTy, 1);
6004     if (IsFSHL) {
6005       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6006       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6007       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6008     } else {
6009       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6010       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6011       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6012     }
6013   }
6014 
6015   MIRBuilder.buildOr(Dst, ShX, ShY);
6016   MI.eraseFromParent();
6017   return Legalized;
6018 }
6019 
6020 LegalizerHelper::LegalizeResult
6021 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6022   // These operations approximately do the following (while avoiding undefined
6023   // shifts by BW):
6024   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6025   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6026   Register Dst = MI.getOperand(0).getReg();
6027   LLT Ty = MRI.getType(Dst);
6028   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6029 
6030   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6031   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6032 
6033   // TODO: Use smarter heuristic that accounts for vector legalization.
6034   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6035     return lowerFunnelShiftAsShifts(MI);
6036 
6037   // This only works for powers of 2, fallback to shifts if it fails.
6038   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6039   if (Result == UnableToLegalize)
6040     return lowerFunnelShiftAsShifts(MI);
6041   return Result;
6042 }
6043 
6044 LegalizerHelper::LegalizeResult
6045 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6046   Register Dst = MI.getOperand(0).getReg();
6047   Register Src = MI.getOperand(1).getReg();
6048   Register Amt = MI.getOperand(2).getReg();
6049   LLT AmtTy = MRI.getType(Amt);
6050   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6051   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6052   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6053   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6054   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6055   MI.eraseFromParent();
6056   return Legalized;
6057 }
6058 
6059 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6060   Register Dst = MI.getOperand(0).getReg();
6061   Register Src = MI.getOperand(1).getReg();
6062   Register Amt = MI.getOperand(2).getReg();
6063   LLT DstTy = MRI.getType(Dst);
6064   LLT SrcTy = MRI.getType(Dst);
6065   LLT AmtTy = MRI.getType(Amt);
6066 
6067   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6068   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6069 
6070   MIRBuilder.setInstrAndDebugLoc(MI);
6071 
6072   // If a rotate in the other direction is supported, use it.
6073   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6074   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6075       isPowerOf2_32(EltSizeInBits))
6076     return lowerRotateWithReverseRotate(MI);
6077 
6078   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6079   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6080   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6081   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6082   Register ShVal;
6083   Register RevShiftVal;
6084   if (isPowerOf2_32(EltSizeInBits)) {
6085     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6086     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6087     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6088     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6089     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6090     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6091     RevShiftVal =
6092         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6093   } else {
6094     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6095     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6096     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6097     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6098     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6099     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6100     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6101     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6102     RevShiftVal =
6103         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6104   }
6105   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6106   MI.eraseFromParent();
6107   return Legalized;
6108 }
6109 
6110 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6111 // representation.
6112 LegalizerHelper::LegalizeResult
6113 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6114   Register Dst = MI.getOperand(0).getReg();
6115   Register Src = MI.getOperand(1).getReg();
6116   const LLT S64 = LLT::scalar(64);
6117   const LLT S32 = LLT::scalar(32);
6118   const LLT S1 = LLT::scalar(1);
6119 
6120   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6121 
6122   // unsigned cul2f(ulong u) {
6123   //   uint lz = clz(u);
6124   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6125   //   u = (u << lz) & 0x7fffffffffffffffUL;
6126   //   ulong t = u & 0xffffffffffUL;
6127   //   uint v = (e << 23) | (uint)(u >> 40);
6128   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6129   //   return as_float(v + r);
6130   // }
6131 
6132   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6133   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6134 
6135   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6136 
6137   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6138   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6139 
6140   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6141   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6142 
6143   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6144   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6145 
6146   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6147 
6148   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6149   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6150 
6151   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6152   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6153   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6154 
6155   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6156   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6157   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6158   auto One = MIRBuilder.buildConstant(S32, 1);
6159 
6160   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6161   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6162   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6163   MIRBuilder.buildAdd(Dst, V, R);
6164 
6165   MI.eraseFromParent();
6166   return Legalized;
6167 }
6168 
6169 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6170   Register Dst = MI.getOperand(0).getReg();
6171   Register Src = MI.getOperand(1).getReg();
6172   LLT DstTy = MRI.getType(Dst);
6173   LLT SrcTy = MRI.getType(Src);
6174 
6175   if (SrcTy == LLT::scalar(1)) {
6176     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6177     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6178     MIRBuilder.buildSelect(Dst, Src, True, False);
6179     MI.eraseFromParent();
6180     return Legalized;
6181   }
6182 
6183   if (SrcTy != LLT::scalar(64))
6184     return UnableToLegalize;
6185 
6186   if (DstTy == LLT::scalar(32)) {
6187     // TODO: SelectionDAG has several alternative expansions to port which may
6188     // be more reasonble depending on the available instructions. If a target
6189     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6190     // intermediate type, this is probably worse.
6191     return lowerU64ToF32BitOps(MI);
6192   }
6193 
6194   return UnableToLegalize;
6195 }
6196 
6197 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6198   Register Dst = MI.getOperand(0).getReg();
6199   Register Src = MI.getOperand(1).getReg();
6200   LLT DstTy = MRI.getType(Dst);
6201   LLT SrcTy = MRI.getType(Src);
6202 
6203   const LLT S64 = LLT::scalar(64);
6204   const LLT S32 = LLT::scalar(32);
6205   const LLT S1 = LLT::scalar(1);
6206 
6207   if (SrcTy == S1) {
6208     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6209     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6210     MIRBuilder.buildSelect(Dst, Src, True, False);
6211     MI.eraseFromParent();
6212     return Legalized;
6213   }
6214 
6215   if (SrcTy != S64)
6216     return UnableToLegalize;
6217 
6218   if (DstTy == S32) {
6219     // signed cl2f(long l) {
6220     //   long s = l >> 63;
6221     //   float r = cul2f((l + s) ^ s);
6222     //   return s ? -r : r;
6223     // }
6224     Register L = Src;
6225     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6226     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6227 
6228     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6229     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6230     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6231 
6232     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6233     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6234                                             MIRBuilder.buildConstant(S64, 0));
6235     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6236     MI.eraseFromParent();
6237     return Legalized;
6238   }
6239 
6240   return UnableToLegalize;
6241 }
6242 
6243 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6244   Register Dst = MI.getOperand(0).getReg();
6245   Register Src = MI.getOperand(1).getReg();
6246   LLT DstTy = MRI.getType(Dst);
6247   LLT SrcTy = MRI.getType(Src);
6248   const LLT S64 = LLT::scalar(64);
6249   const LLT S32 = LLT::scalar(32);
6250 
6251   if (SrcTy != S64 && SrcTy != S32)
6252     return UnableToLegalize;
6253   if (DstTy != S32 && DstTy != S64)
6254     return UnableToLegalize;
6255 
6256   // FPTOSI gives same result as FPTOUI for positive signed integers.
6257   // FPTOUI needs to deal with fp values that convert to unsigned integers
6258   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6259 
6260   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6261   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6262                                                 : APFloat::IEEEdouble(),
6263                     APInt::getNullValue(SrcTy.getSizeInBits()));
6264   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6265 
6266   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6267 
6268   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6269   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6270   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6271   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6272   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6273   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6274   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6275 
6276   const LLT S1 = LLT::scalar(1);
6277 
6278   MachineInstrBuilder FCMP =
6279       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6280   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6281 
6282   MI.eraseFromParent();
6283   return Legalized;
6284 }
6285 
6286 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6287   Register Dst = MI.getOperand(0).getReg();
6288   Register Src = MI.getOperand(1).getReg();
6289   LLT DstTy = MRI.getType(Dst);
6290   LLT SrcTy = MRI.getType(Src);
6291   const LLT S64 = LLT::scalar(64);
6292   const LLT S32 = LLT::scalar(32);
6293 
6294   // FIXME: Only f32 to i64 conversions are supported.
6295   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6296     return UnableToLegalize;
6297 
6298   // Expand f32 -> i64 conversion
6299   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6300   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6301 
6302   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6303 
6304   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6305   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6306 
6307   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6308   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6309 
6310   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6311                                            APInt::getSignMask(SrcEltBits));
6312   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6313   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6314   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6315   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6316 
6317   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6318   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6319   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6320 
6321   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6322   R = MIRBuilder.buildZExt(DstTy, R);
6323 
6324   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6325   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6326   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6327   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6328 
6329   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6330   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6331 
6332   const LLT S1 = LLT::scalar(1);
6333   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6334                                     S1, Exponent, ExponentLoBit);
6335 
6336   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6337 
6338   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6339   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6340 
6341   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6342 
6343   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6344                                           S1, Exponent, ZeroSrcTy);
6345 
6346   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6347   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6348 
6349   MI.eraseFromParent();
6350   return Legalized;
6351 }
6352 
6353 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6354 LegalizerHelper::LegalizeResult
6355 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6356   Register Dst = MI.getOperand(0).getReg();
6357   Register Src = MI.getOperand(1).getReg();
6358 
6359   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6360     return UnableToLegalize;
6361 
6362   const unsigned ExpMask = 0x7ff;
6363   const unsigned ExpBiasf64 = 1023;
6364   const unsigned ExpBiasf16 = 15;
6365   const LLT S32 = LLT::scalar(32);
6366   const LLT S1 = LLT::scalar(1);
6367 
6368   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6369   Register U = Unmerge.getReg(0);
6370   Register UH = Unmerge.getReg(1);
6371 
6372   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6373   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6374 
6375   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6376   // add the f16 bias (15) to get the biased exponent for the f16 format.
6377   E = MIRBuilder.buildAdd(
6378     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6379 
6380   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6381   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6382 
6383   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6384                                        MIRBuilder.buildConstant(S32, 0x1ff));
6385   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6386 
6387   auto Zero = MIRBuilder.buildConstant(S32, 0);
6388   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6389   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6390   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6391 
6392   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6393   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6394   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6395   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6396 
6397   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6398   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6399 
6400   // N = M | (E << 12);
6401   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6402   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6403 
6404   // B = clamp(1-E, 0, 13);
6405   auto One = MIRBuilder.buildConstant(S32, 1);
6406   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6407   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6408   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6409 
6410   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6411                                        MIRBuilder.buildConstant(S32, 0x1000));
6412 
6413   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6414   auto D0 = MIRBuilder.buildShl(S32, D, B);
6415 
6416   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6417                                              D0, SigSetHigh);
6418   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6419   D = MIRBuilder.buildOr(S32, D, D1);
6420 
6421   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6422   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6423 
6424   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6425   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6426 
6427   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6428                                        MIRBuilder.buildConstant(S32, 3));
6429   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6430 
6431   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6432                                        MIRBuilder.buildConstant(S32, 5));
6433   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6434 
6435   V1 = MIRBuilder.buildOr(S32, V0, V1);
6436   V = MIRBuilder.buildAdd(S32, V, V1);
6437 
6438   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6439                                        E, MIRBuilder.buildConstant(S32, 30));
6440   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6441                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6442 
6443   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6444                                          E, MIRBuilder.buildConstant(S32, 1039));
6445   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6446 
6447   // Extract the sign bit.
6448   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6449   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6450 
6451   // Insert the sign bit
6452   V = MIRBuilder.buildOr(S32, Sign, V);
6453 
6454   MIRBuilder.buildTrunc(Dst, V);
6455   MI.eraseFromParent();
6456   return Legalized;
6457 }
6458 
6459 LegalizerHelper::LegalizeResult
6460 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6461   Register Dst = MI.getOperand(0).getReg();
6462   Register Src = MI.getOperand(1).getReg();
6463 
6464   LLT DstTy = MRI.getType(Dst);
6465   LLT SrcTy = MRI.getType(Src);
6466   const LLT S64 = LLT::scalar(64);
6467   const LLT S16 = LLT::scalar(16);
6468 
6469   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6470     return lowerFPTRUNC_F64_TO_F16(MI);
6471 
6472   return UnableToLegalize;
6473 }
6474 
6475 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6476 // multiplication tree.
6477 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6478   Register Dst = MI.getOperand(0).getReg();
6479   Register Src0 = MI.getOperand(1).getReg();
6480   Register Src1 = MI.getOperand(2).getReg();
6481   LLT Ty = MRI.getType(Dst);
6482 
6483   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6484   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6485   MI.eraseFromParent();
6486   return Legalized;
6487 }
6488 
6489 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6490   switch (Opc) {
6491   case TargetOpcode::G_SMIN:
6492     return CmpInst::ICMP_SLT;
6493   case TargetOpcode::G_SMAX:
6494     return CmpInst::ICMP_SGT;
6495   case TargetOpcode::G_UMIN:
6496     return CmpInst::ICMP_ULT;
6497   case TargetOpcode::G_UMAX:
6498     return CmpInst::ICMP_UGT;
6499   default:
6500     llvm_unreachable("not in integer min/max");
6501   }
6502 }
6503 
6504 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6505   Register Dst = MI.getOperand(0).getReg();
6506   Register Src0 = MI.getOperand(1).getReg();
6507   Register Src1 = MI.getOperand(2).getReg();
6508 
6509   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6510   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6511 
6512   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6513   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6514 
6515   MI.eraseFromParent();
6516   return Legalized;
6517 }
6518 
6519 LegalizerHelper::LegalizeResult
6520 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6521   Register Dst = MI.getOperand(0).getReg();
6522   Register Src0 = MI.getOperand(1).getReg();
6523   Register Src1 = MI.getOperand(2).getReg();
6524 
6525   const LLT Src0Ty = MRI.getType(Src0);
6526   const LLT Src1Ty = MRI.getType(Src1);
6527 
6528   const int Src0Size = Src0Ty.getScalarSizeInBits();
6529   const int Src1Size = Src1Ty.getScalarSizeInBits();
6530 
6531   auto SignBitMask = MIRBuilder.buildConstant(
6532     Src0Ty, APInt::getSignMask(Src0Size));
6533 
6534   auto NotSignBitMask = MIRBuilder.buildConstant(
6535     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6536 
6537   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6538   Register And1;
6539   if (Src0Ty == Src1Ty) {
6540     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6541   } else if (Src0Size > Src1Size) {
6542     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6543     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6544     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6545     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6546   } else {
6547     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6548     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6549     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6550     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6551   }
6552 
6553   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6554   // constants are a nan and -0.0, but the final result should preserve
6555   // everything.
6556   unsigned Flags = MI.getFlags();
6557   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6558 
6559   MI.eraseFromParent();
6560   return Legalized;
6561 }
6562 
6563 LegalizerHelper::LegalizeResult
6564 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6565   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6566     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6567 
6568   Register Dst = MI.getOperand(0).getReg();
6569   Register Src0 = MI.getOperand(1).getReg();
6570   Register Src1 = MI.getOperand(2).getReg();
6571   LLT Ty = MRI.getType(Dst);
6572 
6573   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6574     // Insert canonicalizes if it's possible we need to quiet to get correct
6575     // sNaN behavior.
6576 
6577     // Note this must be done here, and not as an optimization combine in the
6578     // absence of a dedicate quiet-snan instruction as we're using an
6579     // omni-purpose G_FCANONICALIZE.
6580     if (!isKnownNeverSNaN(Src0, MRI))
6581       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6582 
6583     if (!isKnownNeverSNaN(Src1, MRI))
6584       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6585   }
6586 
6587   // If there are no nans, it's safe to simply replace this with the non-IEEE
6588   // version.
6589   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6590   MI.eraseFromParent();
6591   return Legalized;
6592 }
6593 
6594 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6595   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6596   Register DstReg = MI.getOperand(0).getReg();
6597   LLT Ty = MRI.getType(DstReg);
6598   unsigned Flags = MI.getFlags();
6599 
6600   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6601                                   Flags);
6602   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6603   MI.eraseFromParent();
6604   return Legalized;
6605 }
6606 
6607 LegalizerHelper::LegalizeResult
6608 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6609   Register DstReg = MI.getOperand(0).getReg();
6610   Register X = MI.getOperand(1).getReg();
6611   const unsigned Flags = MI.getFlags();
6612   const LLT Ty = MRI.getType(DstReg);
6613   const LLT CondTy = Ty.changeElementSize(1);
6614 
6615   // round(x) =>
6616   //  t = trunc(x);
6617   //  d = fabs(x - t);
6618   //  o = copysign(1.0f, x);
6619   //  return t + (d >= 0.5 ? o : 0.0);
6620 
6621   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6622 
6623   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6624   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6625   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6626   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6627   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6628   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6629 
6630   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6631                                   Flags);
6632   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6633 
6634   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6635 
6636   MI.eraseFromParent();
6637   return Legalized;
6638 }
6639 
6640 LegalizerHelper::LegalizeResult
6641 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6642   Register DstReg = MI.getOperand(0).getReg();
6643   Register SrcReg = MI.getOperand(1).getReg();
6644   unsigned Flags = MI.getFlags();
6645   LLT Ty = MRI.getType(DstReg);
6646   const LLT CondTy = Ty.changeElementSize(1);
6647 
6648   // result = trunc(src);
6649   // if (src < 0.0 && src != result)
6650   //   result += -1.0.
6651 
6652   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6653   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6654 
6655   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6656                                   SrcReg, Zero, Flags);
6657   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6658                                       SrcReg, Trunc, Flags);
6659   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6660   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6661 
6662   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6663   MI.eraseFromParent();
6664   return Legalized;
6665 }
6666 
6667 LegalizerHelper::LegalizeResult
6668 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6669   const unsigned NumOps = MI.getNumOperands();
6670   Register DstReg = MI.getOperand(0).getReg();
6671   Register Src0Reg = MI.getOperand(1).getReg();
6672   LLT DstTy = MRI.getType(DstReg);
6673   LLT SrcTy = MRI.getType(Src0Reg);
6674   unsigned PartSize = SrcTy.getSizeInBits();
6675 
6676   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6677   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6678 
6679   for (unsigned I = 2; I != NumOps; ++I) {
6680     const unsigned Offset = (I - 1) * PartSize;
6681 
6682     Register SrcReg = MI.getOperand(I).getReg();
6683     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6684 
6685     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6686       MRI.createGenericVirtualRegister(WideTy);
6687 
6688     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6689     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6690     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6691     ResultReg = NextResult;
6692   }
6693 
6694   if (DstTy.isPointer()) {
6695     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6696           DstTy.getAddressSpace())) {
6697       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6698       return UnableToLegalize;
6699     }
6700 
6701     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6702   }
6703 
6704   MI.eraseFromParent();
6705   return Legalized;
6706 }
6707 
6708 LegalizerHelper::LegalizeResult
6709 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6710   const unsigned NumDst = MI.getNumOperands() - 1;
6711   Register SrcReg = MI.getOperand(NumDst).getReg();
6712   Register Dst0Reg = MI.getOperand(0).getReg();
6713   LLT DstTy = MRI.getType(Dst0Reg);
6714   if (DstTy.isPointer())
6715     return UnableToLegalize; // TODO
6716 
6717   SrcReg = coerceToScalar(SrcReg);
6718   if (!SrcReg)
6719     return UnableToLegalize;
6720 
6721   // Expand scalarizing unmerge as bitcast to integer and shift.
6722   LLT IntTy = MRI.getType(SrcReg);
6723 
6724   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6725 
6726   const unsigned DstSize = DstTy.getSizeInBits();
6727   unsigned Offset = DstSize;
6728   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6729     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6730     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6731     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6732   }
6733 
6734   MI.eraseFromParent();
6735   return Legalized;
6736 }
6737 
6738 /// Lower a vector extract or insert by writing the vector to a stack temporary
6739 /// and reloading the element or vector.
6740 ///
6741 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6742 ///  =>
6743 ///  %stack_temp = G_FRAME_INDEX
6744 ///  G_STORE %vec, %stack_temp
6745 ///  %idx = clamp(%idx, %vec.getNumElements())
6746 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6747 ///  %dst = G_LOAD %element_ptr
6748 LegalizerHelper::LegalizeResult
6749 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6750   Register DstReg = MI.getOperand(0).getReg();
6751   Register SrcVec = MI.getOperand(1).getReg();
6752   Register InsertVal;
6753   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6754     InsertVal = MI.getOperand(2).getReg();
6755 
6756   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6757 
6758   LLT VecTy = MRI.getType(SrcVec);
6759   LLT EltTy = VecTy.getElementType();
6760   if (!EltTy.isByteSized()) { // Not implemented.
6761     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6762     return UnableToLegalize;
6763   }
6764 
6765   unsigned EltBytes = EltTy.getSizeInBytes();
6766   Align VecAlign = getStackTemporaryAlignment(VecTy);
6767   Align EltAlign;
6768 
6769   MachinePointerInfo PtrInfo;
6770   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6771                                         VecAlign, PtrInfo);
6772   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6773 
6774   // Get the pointer to the element, and be sure not to hit undefined behavior
6775   // if the index is out of bounds.
6776   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6777 
6778   int64_t IdxVal;
6779   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6780     int64_t Offset = IdxVal * EltBytes;
6781     PtrInfo = PtrInfo.getWithOffset(Offset);
6782     EltAlign = commonAlignment(VecAlign, Offset);
6783   } else {
6784     // We lose information with a variable offset.
6785     EltAlign = getStackTemporaryAlignment(EltTy);
6786     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6787   }
6788 
6789   if (InsertVal) {
6790     // Write the inserted element
6791     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6792 
6793     // Reload the whole vector.
6794     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6795   } else {
6796     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6797   }
6798 
6799   MI.eraseFromParent();
6800   return Legalized;
6801 }
6802 
6803 LegalizerHelper::LegalizeResult
6804 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6805   Register DstReg = MI.getOperand(0).getReg();
6806   Register Src0Reg = MI.getOperand(1).getReg();
6807   Register Src1Reg = MI.getOperand(2).getReg();
6808   LLT Src0Ty = MRI.getType(Src0Reg);
6809   LLT DstTy = MRI.getType(DstReg);
6810   LLT IdxTy = LLT::scalar(32);
6811 
6812   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6813 
6814   if (DstTy.isScalar()) {
6815     if (Src0Ty.isVector())
6816       return UnableToLegalize;
6817 
6818     // This is just a SELECT.
6819     assert(Mask.size() == 1 && "Expected a single mask element");
6820     Register Val;
6821     if (Mask[0] < 0 || Mask[0] > 1)
6822       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6823     else
6824       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6825     MIRBuilder.buildCopy(DstReg, Val);
6826     MI.eraseFromParent();
6827     return Legalized;
6828   }
6829 
6830   Register Undef;
6831   SmallVector<Register, 32> BuildVec;
6832   LLT EltTy = DstTy.getElementType();
6833 
6834   for (int Idx : Mask) {
6835     if (Idx < 0) {
6836       if (!Undef.isValid())
6837         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6838       BuildVec.push_back(Undef);
6839       continue;
6840     }
6841 
6842     if (Src0Ty.isScalar()) {
6843       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6844     } else {
6845       int NumElts = Src0Ty.getNumElements();
6846       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6847       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6848       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6849       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6850       BuildVec.push_back(Extract.getReg(0));
6851     }
6852   }
6853 
6854   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6855   MI.eraseFromParent();
6856   return Legalized;
6857 }
6858 
6859 LegalizerHelper::LegalizeResult
6860 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6861   const auto &MF = *MI.getMF();
6862   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6863   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6864     return UnableToLegalize;
6865 
6866   Register Dst = MI.getOperand(0).getReg();
6867   Register AllocSize = MI.getOperand(1).getReg();
6868   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6869 
6870   LLT PtrTy = MRI.getType(Dst);
6871   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6872 
6873   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6874   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6875   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6876 
6877   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6878   // have to generate an extra instruction to negate the alloc and then use
6879   // G_PTR_ADD to add the negative offset.
6880   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6881   if (Alignment > Align(1)) {
6882     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6883     AlignMask.negate();
6884     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6885     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6886   }
6887 
6888   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6889   MIRBuilder.buildCopy(SPReg, SPTmp);
6890   MIRBuilder.buildCopy(Dst, SPTmp);
6891 
6892   MI.eraseFromParent();
6893   return Legalized;
6894 }
6895 
6896 LegalizerHelper::LegalizeResult
6897 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6898   Register Dst = MI.getOperand(0).getReg();
6899   Register Src = MI.getOperand(1).getReg();
6900   unsigned Offset = MI.getOperand(2).getImm();
6901 
6902   LLT DstTy = MRI.getType(Dst);
6903   LLT SrcTy = MRI.getType(Src);
6904 
6905   if (DstTy.isScalar() &&
6906       (SrcTy.isScalar() ||
6907        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6908     LLT SrcIntTy = SrcTy;
6909     if (!SrcTy.isScalar()) {
6910       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6911       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6912     }
6913 
6914     if (Offset == 0)
6915       MIRBuilder.buildTrunc(Dst, Src);
6916     else {
6917       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6918       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6919       MIRBuilder.buildTrunc(Dst, Shr);
6920     }
6921 
6922     MI.eraseFromParent();
6923     return Legalized;
6924   }
6925 
6926   return UnableToLegalize;
6927 }
6928 
6929 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6930   Register Dst = MI.getOperand(0).getReg();
6931   Register Src = MI.getOperand(1).getReg();
6932   Register InsertSrc = MI.getOperand(2).getReg();
6933   uint64_t Offset = MI.getOperand(3).getImm();
6934 
6935   LLT DstTy = MRI.getType(Src);
6936   LLT InsertTy = MRI.getType(InsertSrc);
6937 
6938   if (InsertTy.isVector() ||
6939       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6940     return UnableToLegalize;
6941 
6942   const DataLayout &DL = MIRBuilder.getDataLayout();
6943   if ((DstTy.isPointer() &&
6944        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6945       (InsertTy.isPointer() &&
6946        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6947     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6948     return UnableToLegalize;
6949   }
6950 
6951   LLT IntDstTy = DstTy;
6952 
6953   if (!DstTy.isScalar()) {
6954     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6955     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6956   }
6957 
6958   if (!InsertTy.isScalar()) {
6959     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6960     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6961   }
6962 
6963   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6964   if (Offset != 0) {
6965     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6966     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6967   }
6968 
6969   APInt MaskVal = APInt::getBitsSetWithWrap(
6970       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6971 
6972   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6973   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6974   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6975 
6976   MIRBuilder.buildCast(Dst, Or);
6977   MI.eraseFromParent();
6978   return Legalized;
6979 }
6980 
6981 LegalizerHelper::LegalizeResult
6982 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6983   Register Dst0 = MI.getOperand(0).getReg();
6984   Register Dst1 = MI.getOperand(1).getReg();
6985   Register LHS = MI.getOperand(2).getReg();
6986   Register RHS = MI.getOperand(3).getReg();
6987   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6988 
6989   LLT Ty = MRI.getType(Dst0);
6990   LLT BoolTy = MRI.getType(Dst1);
6991 
6992   if (IsAdd)
6993     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6994   else
6995     MIRBuilder.buildSub(Dst0, LHS, RHS);
6996 
6997   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6998 
6999   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7000 
7001   // For an addition, the result should be less than one of the operands (LHS)
7002   // if and only if the other operand (RHS) is negative, otherwise there will
7003   // be overflow.
7004   // For a subtraction, the result should be less than one of the operands
7005   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7006   // otherwise there will be overflow.
7007   auto ResultLowerThanLHS =
7008       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7009   auto ConditionRHS = MIRBuilder.buildICmp(
7010       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7011 
7012   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7013   MI.eraseFromParent();
7014   return Legalized;
7015 }
7016 
7017 LegalizerHelper::LegalizeResult
7018 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7019   Register Res = MI.getOperand(0).getReg();
7020   Register LHS = MI.getOperand(1).getReg();
7021   Register RHS = MI.getOperand(2).getReg();
7022   LLT Ty = MRI.getType(Res);
7023   bool IsSigned;
7024   bool IsAdd;
7025   unsigned BaseOp;
7026   switch (MI.getOpcode()) {
7027   default:
7028     llvm_unreachable("unexpected addsat/subsat opcode");
7029   case TargetOpcode::G_UADDSAT:
7030     IsSigned = false;
7031     IsAdd = true;
7032     BaseOp = TargetOpcode::G_ADD;
7033     break;
7034   case TargetOpcode::G_SADDSAT:
7035     IsSigned = true;
7036     IsAdd = true;
7037     BaseOp = TargetOpcode::G_ADD;
7038     break;
7039   case TargetOpcode::G_USUBSAT:
7040     IsSigned = false;
7041     IsAdd = false;
7042     BaseOp = TargetOpcode::G_SUB;
7043     break;
7044   case TargetOpcode::G_SSUBSAT:
7045     IsSigned = true;
7046     IsAdd = false;
7047     BaseOp = TargetOpcode::G_SUB;
7048     break;
7049   }
7050 
7051   if (IsSigned) {
7052     // sadd.sat(a, b) ->
7053     //   hi = 0x7fffffff - smax(a, 0)
7054     //   lo = 0x80000000 - smin(a, 0)
7055     //   a + smin(smax(lo, b), hi)
7056     // ssub.sat(a, b) ->
7057     //   lo = smax(a, -1) - 0x7fffffff
7058     //   hi = smin(a, -1) - 0x80000000
7059     //   a - smin(smax(lo, b), hi)
7060     // TODO: AMDGPU can use a "median of 3" instruction here:
7061     //   a +/- med3(lo, b, hi)
7062     uint64_t NumBits = Ty.getScalarSizeInBits();
7063     auto MaxVal =
7064         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7065     auto MinVal =
7066         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7067     MachineInstrBuilder Hi, Lo;
7068     if (IsAdd) {
7069       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7070       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7071       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7072     } else {
7073       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7074       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7075                                MaxVal);
7076       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7077                                MinVal);
7078     }
7079     auto RHSClamped =
7080         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7081     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7082   } else {
7083     // uadd.sat(a, b) -> a + umin(~a, b)
7084     // usub.sat(a, b) -> a - umin(a, b)
7085     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7086     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7087     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7088   }
7089 
7090   MI.eraseFromParent();
7091   return Legalized;
7092 }
7093 
7094 LegalizerHelper::LegalizeResult
7095 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7096   Register Res = MI.getOperand(0).getReg();
7097   Register LHS = MI.getOperand(1).getReg();
7098   Register RHS = MI.getOperand(2).getReg();
7099   LLT Ty = MRI.getType(Res);
7100   LLT BoolTy = Ty.changeElementSize(1);
7101   bool IsSigned;
7102   bool IsAdd;
7103   unsigned OverflowOp;
7104   switch (MI.getOpcode()) {
7105   default:
7106     llvm_unreachable("unexpected addsat/subsat opcode");
7107   case TargetOpcode::G_UADDSAT:
7108     IsSigned = false;
7109     IsAdd = true;
7110     OverflowOp = TargetOpcode::G_UADDO;
7111     break;
7112   case TargetOpcode::G_SADDSAT:
7113     IsSigned = true;
7114     IsAdd = true;
7115     OverflowOp = TargetOpcode::G_SADDO;
7116     break;
7117   case TargetOpcode::G_USUBSAT:
7118     IsSigned = false;
7119     IsAdd = false;
7120     OverflowOp = TargetOpcode::G_USUBO;
7121     break;
7122   case TargetOpcode::G_SSUBSAT:
7123     IsSigned = true;
7124     IsAdd = false;
7125     OverflowOp = TargetOpcode::G_SSUBO;
7126     break;
7127   }
7128 
7129   auto OverflowRes =
7130       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7131   Register Tmp = OverflowRes.getReg(0);
7132   Register Ov = OverflowRes.getReg(1);
7133   MachineInstrBuilder Clamp;
7134   if (IsSigned) {
7135     // sadd.sat(a, b) ->
7136     //   {tmp, ov} = saddo(a, b)
7137     //   ov ? (tmp >>s 31) + 0x80000000 : r
7138     // ssub.sat(a, b) ->
7139     //   {tmp, ov} = ssubo(a, b)
7140     //   ov ? (tmp >>s 31) + 0x80000000 : r
7141     uint64_t NumBits = Ty.getScalarSizeInBits();
7142     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7143     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7144     auto MinVal =
7145         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7146     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7147   } else {
7148     // uadd.sat(a, b) ->
7149     //   {tmp, ov} = uaddo(a, b)
7150     //   ov ? 0xffffffff : tmp
7151     // usub.sat(a, b) ->
7152     //   {tmp, ov} = usubo(a, b)
7153     //   ov ? 0 : tmp
7154     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7155   }
7156   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7157 
7158   MI.eraseFromParent();
7159   return Legalized;
7160 }
7161 
7162 LegalizerHelper::LegalizeResult
7163 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7164   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7165           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7166          "Expected shlsat opcode!");
7167   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7168   Register Res = MI.getOperand(0).getReg();
7169   Register LHS = MI.getOperand(1).getReg();
7170   Register RHS = MI.getOperand(2).getReg();
7171   LLT Ty = MRI.getType(Res);
7172   LLT BoolTy = Ty.changeElementSize(1);
7173 
7174   unsigned BW = Ty.getScalarSizeInBits();
7175   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7176   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7177                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7178 
7179   MachineInstrBuilder SatVal;
7180   if (IsSigned) {
7181     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7182     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7183     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7184                                     MIRBuilder.buildConstant(Ty, 0));
7185     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7186   } else {
7187     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7188   }
7189   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7190   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7191 
7192   MI.eraseFromParent();
7193   return Legalized;
7194 }
7195 
7196 LegalizerHelper::LegalizeResult
7197 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7198   Register Dst = MI.getOperand(0).getReg();
7199   Register Src = MI.getOperand(1).getReg();
7200   const LLT Ty = MRI.getType(Src);
7201   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7202   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7203 
7204   // Swap most and least significant byte, set remaining bytes in Res to zero.
7205   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7206   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7207   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7208   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7209 
7210   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7211   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7212     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7213     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7214     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7215     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7216     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7217     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7218     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7219     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7220     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7221     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7222     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7223     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7224   }
7225   Res.getInstr()->getOperand(0).setReg(Dst);
7226 
7227   MI.eraseFromParent();
7228   return Legalized;
7229 }
7230 
7231 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7232 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7233                                  MachineInstrBuilder Src, APInt Mask) {
7234   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7235   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7236   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7237   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7238   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7239   return B.buildOr(Dst, LHS, RHS);
7240 }
7241 
7242 LegalizerHelper::LegalizeResult
7243 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7244   Register Dst = MI.getOperand(0).getReg();
7245   Register Src = MI.getOperand(1).getReg();
7246   const LLT Ty = MRI.getType(Src);
7247   unsigned Size = Ty.getSizeInBits();
7248 
7249   MachineInstrBuilder BSWAP =
7250       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7251 
7252   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7253   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7254   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7255   MachineInstrBuilder Swap4 =
7256       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7257 
7258   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7259   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7260   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7261   MachineInstrBuilder Swap2 =
7262       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7263 
7264   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7265   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7266   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7267   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7268 
7269   MI.eraseFromParent();
7270   return Legalized;
7271 }
7272 
7273 LegalizerHelper::LegalizeResult
7274 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7275   MachineFunction &MF = MIRBuilder.getMF();
7276 
7277   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7278   int NameOpIdx = IsRead ? 1 : 0;
7279   int ValRegIndex = IsRead ? 0 : 1;
7280 
7281   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7282   const LLT Ty = MRI.getType(ValReg);
7283   const MDString *RegStr = cast<MDString>(
7284     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7285 
7286   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7287   if (!PhysReg.isValid())
7288     return UnableToLegalize;
7289 
7290   if (IsRead)
7291     MIRBuilder.buildCopy(ValReg, PhysReg);
7292   else
7293     MIRBuilder.buildCopy(PhysReg, ValReg);
7294 
7295   MI.eraseFromParent();
7296   return Legalized;
7297 }
7298 
7299 LegalizerHelper::LegalizeResult
7300 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7301   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7302   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7303   Register Result = MI.getOperand(0).getReg();
7304   LLT OrigTy = MRI.getType(Result);
7305   auto SizeInBits = OrigTy.getScalarSizeInBits();
7306   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7307 
7308   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7309   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7310   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7311   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7312 
7313   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7314   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7315   MIRBuilder.buildTrunc(Result, Shifted);
7316 
7317   MI.eraseFromParent();
7318   return Legalized;
7319 }
7320 
7321 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7322   // Implement vector G_SELECT in terms of XOR, AND, OR.
7323   Register DstReg = MI.getOperand(0).getReg();
7324   Register MaskReg = MI.getOperand(1).getReg();
7325   Register Op1Reg = MI.getOperand(2).getReg();
7326   Register Op2Reg = MI.getOperand(3).getReg();
7327   LLT DstTy = MRI.getType(DstReg);
7328   LLT MaskTy = MRI.getType(MaskReg);
7329   LLT Op1Ty = MRI.getType(Op1Reg);
7330   if (!DstTy.isVector())
7331     return UnableToLegalize;
7332 
7333   // Vector selects can have a scalar predicate. If so, splat into a vector and
7334   // finish for later legalization attempts to try again.
7335   if (MaskTy.isScalar()) {
7336     Register MaskElt = MaskReg;
7337     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7338       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7339     // Generate a vector splat idiom to be pattern matched later.
7340     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7341     Observer.changingInstr(MI);
7342     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7343     Observer.changedInstr(MI);
7344     return Legalized;
7345   }
7346 
7347   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7348     return UnableToLegalize;
7349   }
7350 
7351   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7352   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7353   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7354   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7355   MI.eraseFromParent();
7356   return Legalized;
7357 }
7358 
7359 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7360   // Split DIVREM into individual instructions.
7361   unsigned Opcode = MI.getOpcode();
7362 
7363   MIRBuilder.buildInstr(
7364       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7365                                         : TargetOpcode::G_UDIV,
7366       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7367   MIRBuilder.buildInstr(
7368       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7369                                         : TargetOpcode::G_UREM,
7370       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7371   MI.eraseFromParent();
7372   return Legalized;
7373 }
7374 
7375 LegalizerHelper::LegalizeResult
7376 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7377   // Expand %res = G_ABS %a into:
7378   // %v1 = G_ASHR %a, scalar_size-1
7379   // %v2 = G_ADD %a, %v1
7380   // %res = G_XOR %v2, %v1
7381   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7382   Register OpReg = MI.getOperand(1).getReg();
7383   auto ShiftAmt =
7384       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7385   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7386   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7387   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7388   MI.eraseFromParent();
7389   return Legalized;
7390 }
7391 
7392 LegalizerHelper::LegalizeResult
7393 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7394   // Expand %res = G_ABS %a into:
7395   // %v1 = G_CONSTANT 0
7396   // %v2 = G_SUB %v1, %a
7397   // %res = G_SMAX %a, %v2
7398   Register SrcReg = MI.getOperand(1).getReg();
7399   LLT Ty = MRI.getType(SrcReg);
7400   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7401   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7402   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7403   MI.eraseFromParent();
7404   return Legalized;
7405 }
7406 
7407 LegalizerHelper::LegalizeResult LegalizerHelper::lowerIsNaN(MachineInstr &MI) {
7408   Register Dst = MI.getOperand(0).getReg();
7409   Register Src = MI.getOperand(1).getReg();
7410   LLT SrcTy = MRI.getType(Src);
7411   if (MI.getFlags() & MachineInstr::NoFPExcept) {
7412     // Lower to an unordered comparison.
7413     auto Zero = MIRBuilder.buildFConstant(SrcTy, 0.0);
7414     MIRBuilder.buildFCmp(CmpInst::Predicate::FCMP_UNO, Dst, Src, Zero);
7415     MI.eraseFromParent();
7416     return Legalized;
7417   }
7418 
7419   // Use integer operations to avoid traps if the argument is SNaN.
7420 
7421   // NaN has all exp bits set and a non zero significand. Therefore:
7422   // isnan(V) == exp mask < abs(V)
7423   auto FPToSI = MIRBuilder.buildFPTOSI(SrcTy, Src);
7424   auto Mask = APInt::getSignedMaxValue(SrcTy.getScalarSizeInBits());
7425   auto MaskCst = MIRBuilder.buildConstant(SrcTy, Mask);
7426   auto AbsV = MIRBuilder.buildAnd(SrcTy, FPToSI, MaskCst);
7427   auto *FloatTy = getFloatTypeForLLT(MI.getMF()->getFunction().getContext(),
7428                                      SrcTy.getScalarType());
7429   if (!FloatTy)
7430     return UnableToLegalize;
7431   auto ExpMask = APFloat::getInf(FloatTy->getFltSemantics()).bitcastToAPInt();
7432   auto ExpMaskCst = MIRBuilder.buildConstant(SrcTy, ExpMask);
7433   MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, Dst, ExpMaskCst, AbsV);
7434   MI.eraseFromParent();
7435   return Legalized;
7436 }
7437 
7438 LegalizerHelper::LegalizeResult
7439 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7440   Register SrcReg = MI.getOperand(1).getReg();
7441   LLT SrcTy = MRI.getType(SrcReg);
7442   LLT DstTy = MRI.getType(SrcReg);
7443 
7444   // The source could be a scalar if the IR type was <1 x sN>.
7445   if (SrcTy.isScalar()) {
7446     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7447       return UnableToLegalize; // FIXME: handle extension.
7448     // This can be just a plain copy.
7449     Observer.changingInstr(MI);
7450     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7451     Observer.changedInstr(MI);
7452     return Legalized;
7453   }
7454   return UnableToLegalize;;
7455 }
7456