1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetFrameLowering.h"
24 #include "llvm/CodeGen/TargetInstrInfo.h"
25 #include "llvm/CodeGen/TargetLowering.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include "llvm/Target/TargetMachine.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
/// Returns the number of \p NarrowTy pieces and the number of leftover pieces
/// needed to reconstruct \p OrigTy, setting \p LeftoverTy to the leftover type.
44 ///
/// Returns {-1, -1} if the breakdown is not satisfiable.
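///
/// For example, breaking an s88 \p OrigTy into s64 \p NarrowTy pieces yields
/// {1, 1}, with \p LeftoverTy set to s24.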
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
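/// Return the IR floating-point type with the same bit width as scalar \p Ty,
/// e.g. s32 -> float and s64 -> double, or null if there is no match.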
static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
95 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
96                                  GISelChangeObserver &Observer,
97                                  MachineIRBuilder &Builder)
98     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
99       LI(*MF.getSubtarget().getLegalizerInfo()),
100       TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &B)
105   : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
106     TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
108 LegalizerHelper::LegalizeResult
109 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
110                                    LostDebugLocObserver &LocObserver) {
111   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
112 
113   MIRBuilder.setInstrAndDebugLoc(MI);
114 
115   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
116       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
117     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
118   auto Step = LI.getAction(MI, MRI);
119   switch (Step.Action) {
120   case Legal:
121     LLVM_DEBUG(dbgs() << ".. Already legal\n");
122     return AlreadyLegal;
123   case Libcall:
124     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
125     return libcall(MI, LocObserver);
126   case NarrowScalar:
127     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
128     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
129   case WidenScalar:
130     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
131     return widenScalar(MI, Step.TypeIdx, Step.NewType);
132   case Bitcast:
133     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
134     return bitcast(MI, Step.TypeIdx, Step.NewType);
135   case Lower:
136     LLVM_DEBUG(dbgs() << ".. Lower\n");
137     return lower(MI, Step.TypeIdx, Step.NewType);
138   case FewerElements:
139     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
140     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
141   case MoreElements:
142     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
143     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
144   case Custom:
145     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
146     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
147   default:
148     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
149     return UnableToLegalize;
150   }
151 }
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
160 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
161                                    LLT MainTy, LLT &LeftoverTy,
162                                    SmallVectorImpl<Register> &VRegs,
163                                    SmallVectorImpl<Register> &LeftoverRegs) {
164   assert(!LeftoverTy.isValid() && "this is an out argument");
165 
166   unsigned RegSize = RegTy.getSizeInBits();
167   unsigned MainSize = MainTy.getSizeInBits();
168   unsigned NumParts = RegSize / MainSize;
169   unsigned LeftoverSize = RegSize - NumParts * MainSize;
170 
171   // Use an unmerge when possible.
172   if (LeftoverSize == 0) {
173     for (unsigned I = 0; I < NumParts; ++I)
174       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
175     MIRBuilder.buildUnmerge(VRegs, Reg);
176     return true;
177   }
178 
179   if (MainTy.isVector()) {
180     unsigned EltSize = MainTy.getScalarSizeInBits();
181     if (LeftoverSize % EltSize != 0)
182       return false;
183     LeftoverTy = LLT::scalarOrVector(
184         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
185   } else {
186     LeftoverTy = LLT::scalar(LeftoverSize);
187   }
188 
189   // For irregular sizes, extract the individual parts.
190   for (unsigned I = 0; I != NumParts; ++I) {
191     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
192     VRegs.push_back(NewReg);
193     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
194   }
195 
196   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
197        Offset += LeftoverSize) {
198     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
199     LeftoverRegs.push_back(NewReg);
200     MIRBuilder.buildExtract(NewReg, Reg, Offset);
201   }
202 
203   return true;
204 }
205 
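/// Merge \p PartRegs, plus any \p LeftoverRegs of type \p LeftoverTy, back
/// into \p DstReg of type \p ResultTy. This is the inverse of extractParts;
/// e.g. one s64 part plus one s24 leftover recombine into an s88 result.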
206 void LegalizerHelper::insertParts(Register DstReg,
207                                   LLT ResultTy, LLT PartTy,
208                                   ArrayRef<Register> PartRegs,
209                                   LLT LeftoverTy,
210                                   ArrayRef<Register> LeftoverRegs) {
211   if (!LeftoverTy.isValid()) {
212     assert(LeftoverRegs.empty());
213 
214     if (!ResultTy.isVector()) {
215       MIRBuilder.buildMerge(DstReg, PartRegs);
216       return;
217     }
218 
219     if (PartTy.isVector())
220       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
221     else
222       MIRBuilder.buildBuildVector(DstReg, PartRegs);
223     return;
224   }
225 
226   SmallVector<Register> GCDRegs;
227   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
228   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
229     extractGCDType(GCDRegs, GCDTy, PartReg);
230   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
231   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
232 }
233 
234 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
235 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
236                               const MachineInstr &MI) {
237   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
238 
239   const int StartIdx = Regs.size();
240   const int NumResults = MI.getNumOperands() - 1;
241   Regs.resize(Regs.size() + NumResults);
242   for (int I = 0; I != NumResults; ++I)
243     Regs[StartIdx + I] = MI.getOperand(I).getReg();
244 }
245 
246 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
247                                      LLT GCDTy, Register SrcReg) {
248   LLT SrcTy = MRI.getType(SrcReg);
249   if (SrcTy == GCDTy) {
    // The source is already the GCD type, so no splitting is needed.
252     Parts.push_back(SrcReg);
253   } else {
254     // Need to split into common type sized pieces.
255     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
256     getUnmergeResults(Parts, *Unmerge);
257   }
258 }
259 
260 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
261                                     LLT NarrowTy, Register SrcReg) {
262   LLT SrcTy = MRI.getType(SrcReg);
263   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
264   extractGCDType(Parts, GCDTy, SrcReg);
265   return GCDTy;
266 }
267 
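/// Merge the GCD-type registers in \p VRegs into NarrowTy-sized pieces that
/// cover the least common multiple of \p DstTy and \p NarrowTy, padding with
/// \p PadStrategy where the sources run out. For example, with DstTy = s12,
/// NarrowTy = s8, and GCDTy = s4, three s4 sources become three s8 pieces
/// covering the s24 LCM type: {v0, v1}, {v2, pad}, and {pad, pad}.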
268 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
269                                          SmallVectorImpl<Register> &VRegs,
270                                          unsigned PadStrategy) {
271   LLT LCMTy = getLCMType(DstTy, NarrowTy);
272 
273   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
274   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
275   int NumOrigSrc = VRegs.size();
276 
277   Register PadReg;
278 
279   // Get a value we can use to pad the source value if the sources won't evenly
280   // cover the result type.
281   if (NumOrigSrc < NumParts * NumSubParts) {
282     if (PadStrategy == TargetOpcode::G_ZEXT)
283       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
284     else if (PadStrategy == TargetOpcode::G_ANYEXT)
285       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
286     else {
287       assert(PadStrategy == TargetOpcode::G_SEXT);
288 
      // Replicate the sign bit of the highest source piece for use as padding.
290       auto ShiftAmt =
291         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
292       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
293     }
294   }
295 
296   // Registers for the final merge to be produced.
297   SmallVector<Register, 4> Remerge(NumParts);
298 
299   // Registers needed for intermediate merges, which will be merged into a
300   // source for Remerge.
301   SmallVector<Register, 4> SubMerge(NumSubParts);
302 
303   // Once we've fully read off the end of the original source bits, we can reuse
304   // the same high bits for remaining padding elements.
305   Register AllPadReg;
306 
307   // Build merges to the LCM type to cover the original result type.
308   for (int I = 0; I != NumParts; ++I) {
309     bool AllMergePartsArePadding = true;
310 
311     // Build the requested merges to the requested type.
312     for (int J = 0; J != NumSubParts; ++J) {
313       int Idx = I * NumSubParts + J;
314       if (Idx >= NumOrigSrc) {
315         SubMerge[J] = PadReg;
316         continue;
317       }
318 
319       SubMerge[J] = VRegs[Idx];
320 
321       // There are meaningful bits here we can't reuse later.
322       AllMergePartsArePadding = false;
323     }
324 
325     // If we've filled up a complete piece with padding bits, we can directly
326     // emit the natural sized constant if applicable, rather than a merge of
327     // smaller constants.
328     if (AllMergePartsArePadding && !AllPadReg) {
329       if (PadStrategy == TargetOpcode::G_ANYEXT)
330         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
331       else if (PadStrategy == TargetOpcode::G_ZEXT)
332         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
333 
334       // If this is a sign extension, we can't materialize a trivial constant
335       // with the right type and have to produce a merge.
336     }
337 
338     if (AllPadReg) {
339       // Avoid creating additional instructions if we're just adding additional
340       // copies of padding bits.
341       Remerge[I] = AllPadReg;
342       continue;
343     }
344 
345     if (NumSubParts == 1)
346       Remerge[I] = SubMerge[0];
347     else
348       Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
349 
350     // In the sign extend padding case, re-use the first all-signbit merge.
351     if (AllMergePartsArePadding && !AllPadReg)
352       AllPadReg = Remerge[I];
353   }
354 
355   VRegs = std::move(Remerge);
356   return LCMTy;
357 }
358 
359 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
360                                                ArrayRef<Register> RemergeRegs) {
361   LLT DstTy = MRI.getType(DstReg);
362 
363   // Create the merge to the widened source, and extract the relevant bits into
364   // the result.
365 
366   if (DstTy == LCMTy) {
367     MIRBuilder.buildMerge(DstReg, RemergeRegs);
368     return;
369   }
370 
371   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
372   if (DstTy.isScalar() && LCMTy.isScalar()) {
373     MIRBuilder.buildTrunc(DstReg, Remerge);
374     return;
375   }
376 
377   if (LCMTy.isVector()) {
378     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
379     SmallVector<Register, 8> UnmergeDefs(NumDefs);
380     UnmergeDefs[0] = DstReg;
381     for (unsigned I = 1; I != NumDefs; ++I)
382       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
383 
384     MIRBuilder.buildUnmerge(UnmergeDefs,
385                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
386     return;
387   }
388 
389   llvm_unreachable("unhandled case");
390 }
391 
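/// Map an opcode and scalar size to the corresponding runtime library call,
/// e.g. {G_FSIN, 32} -> RTLIB::SIN_F32.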
392 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
393 #define RTLIBCASE_INT(LibcallPrefix)                                           \
394   do {                                                                         \
395     switch (Size) {                                                            \
396     case 32:                                                                   \
397       return RTLIB::LibcallPrefix##32;                                         \
398     case 64:                                                                   \
399       return RTLIB::LibcallPrefix##64;                                         \
400     case 128:                                                                  \
401       return RTLIB::LibcallPrefix##128;                                        \
402     default:                                                                   \
403       llvm_unreachable("unexpected size");                                     \
404     }                                                                          \
405   } while (0)
406 
407 #define RTLIBCASE(LibcallPrefix)                                               \
408   do {                                                                         \
409     switch (Size) {                                                            \
410     case 32:                                                                   \
411       return RTLIB::LibcallPrefix##32;                                         \
412     case 64:                                                                   \
413       return RTLIB::LibcallPrefix##64;                                         \
414     case 80:                                                                   \
415       return RTLIB::LibcallPrefix##80;                                         \
416     case 128:                                                                  \
417       return RTLIB::LibcallPrefix##128;                                        \
418     default:                                                                   \
419       llvm_unreachable("unexpected size");                                     \
420     }                                                                          \
421   } while (0)
422 
423   switch (Opcode) {
424   case TargetOpcode::G_SDIV:
425     RTLIBCASE_INT(SDIV_I);
426   case TargetOpcode::G_UDIV:
427     RTLIBCASE_INT(UDIV_I);
428   case TargetOpcode::G_SREM:
429     RTLIBCASE_INT(SREM_I);
430   case TargetOpcode::G_UREM:
431     RTLIBCASE_INT(UREM_I);
432   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
433     RTLIBCASE_INT(CTLZ_I);
434   case TargetOpcode::G_FADD:
435     RTLIBCASE(ADD_F);
436   case TargetOpcode::G_FSUB:
437     RTLIBCASE(SUB_F);
438   case TargetOpcode::G_FMUL:
439     RTLIBCASE(MUL_F);
440   case TargetOpcode::G_FDIV:
441     RTLIBCASE(DIV_F);
442   case TargetOpcode::G_FEXP:
443     RTLIBCASE(EXP_F);
444   case TargetOpcode::G_FEXP2:
445     RTLIBCASE(EXP2_F);
446   case TargetOpcode::G_FREM:
447     RTLIBCASE(REM_F);
448   case TargetOpcode::G_FPOW:
449     RTLIBCASE(POW_F);
450   case TargetOpcode::G_FMA:
451     RTLIBCASE(FMA_F);
452   case TargetOpcode::G_FSIN:
453     RTLIBCASE(SIN_F);
454   case TargetOpcode::G_FCOS:
455     RTLIBCASE(COS_F);
456   case TargetOpcode::G_FLOG10:
457     RTLIBCASE(LOG10_F);
458   case TargetOpcode::G_FLOG:
459     RTLIBCASE(LOG_F);
460   case TargetOpcode::G_FLOG2:
461     RTLIBCASE(LOG2_F);
462   case TargetOpcode::G_FCEIL:
463     RTLIBCASE(CEIL_F);
464   case TargetOpcode::G_FFLOOR:
465     RTLIBCASE(FLOOR_F);
466   case TargetOpcode::G_FMINNUM:
467     RTLIBCASE(FMIN_F);
468   case TargetOpcode::G_FMAXNUM:
469     RTLIBCASE(FMAX_F);
470   case TargetOpcode::G_FSQRT:
471     RTLIBCASE(SQRT_F);
472   case TargetOpcode::G_FRINT:
473     RTLIBCASE(RINT_F);
474   case TargetOpcode::G_FNEARBYINT:
475     RTLIBCASE(NEARBYINT_F);
476   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
477     RTLIBCASE(ROUNDEVEN_F);
478   }
479   llvm_unreachable("Unknown libcall function");
480 }
481 
482 /// True if an instruction is in tail position in its caller. Intended for
483 /// legalizing libcalls as tail calls when possible.
484 static bool isLibCallInTailPosition(MachineInstr &MI,
485                                     const TargetInstrInfo &TII,
486                                     MachineRegisterInfo &MRI) {
487   MachineBasicBlock &MBB = *MI.getParent();
488   const Function &F = MBB.getParent()->getFunction();
489 
490   // Conservatively require the attributes of the call to match those of
491   // the return. Ignore NoAlias and NonNull because they don't affect the
492   // call sequence.
493   AttributeList CallerAttrs = F.getAttributes();
494   if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
495           .removeAttribute(Attribute::NoAlias)
496           .removeAttribute(Attribute::NonNull)
497           .hasAttributes())
498     return false;
499 
500   // It's not safe to eliminate the sign / zero extension of the return value.
501   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
502       CallerAttrs.hasRetAttr(Attribute::SExt))
503     return false;
504 
505   // Only tail call if the following instruction is a standard return or if we
506   // have a `thisreturn` callee, and a sequence like:
507   //
508   //   G_MEMCPY %0, %1, %2
509   //   $x0 = COPY %0
510   //   RET_ReallyLR implicit $x0
511   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
512   if (Next != MBB.instr_end() && Next->isCopy()) {
513     switch (MI.getOpcode()) {
514     default:
515       llvm_unreachable("unsupported opcode");
516     case TargetOpcode::G_BZERO:
517       return false;
518     case TargetOpcode::G_MEMCPY:
519     case TargetOpcode::G_MEMMOVE:
520     case TargetOpcode::G_MEMSET:
521       break;
522     }
523 
524     Register VReg = MI.getOperand(0).getReg();
525     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
526       return false;
527 
528     Register PReg = Next->getOperand(0).getReg();
529     if (!PReg.isPhysical())
530       return false;
531 
532     auto Ret = next_nodbg(Next, MBB.instr_end());
533     if (Ret == MBB.instr_end() || !Ret->isReturn())
534       return false;
535 
536     if (Ret->getNumImplicitOperands() != 1)
537       return false;
538 
539     if (PReg != Ret->getOperand(0).getReg())
540       return false;
541 
542     // Skip over the COPY that we just validated.
543     Next = Ret;
544   }
545 
546   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
547     return false;
548 
549   return true;
550 }
551 
552 LegalizerHelper::LegalizeResult
553 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
554                     const CallLowering::ArgInfo &Result,
555                     ArrayRef<CallLowering::ArgInfo> Args,
556                     const CallingConv::ID CC) {
557   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
558 
559   CallLowering::CallLoweringInfo Info;
560   Info.CallConv = CC;
561   Info.Callee = MachineOperand::CreateES(Name);
562   Info.OrigRet = Result;
563   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
564   if (!CLI.lowerCall(MIRBuilder, Info))
565     return LegalizerHelper::UnableToLegalize;
566 
567   return LegalizerHelper::Legalized;
568 }
569 
570 LegalizerHelper::LegalizeResult
571 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
572                     const CallLowering::ArgInfo &Result,
573                     ArrayRef<CallLowering::ArgInfo> Args) {
574   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
575   const char *Name = TLI.getLibcallName(Libcall);
576   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
577   return createLibcall(MIRBuilder, Name, Result, Args, CC);
578 }
579 
580 // Useful for libcalls where all operands have the same type.
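// For example, an s64 G_FREM becomes a call to the RTLIB::REM_F64 libcall,
// which is typically fmod.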
581 static LegalizerHelper::LegalizeResult
582 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
583               Type *OpType) {
584   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
585 
586   // FIXME: What does the original arg index mean here?
587   SmallVector<CallLowering::ArgInfo, 3> Args;
588   for (unsigned i = 1; i < MI.getNumOperands(); i++)
589     Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
590   return createLibcall(MIRBuilder, Libcall,
591                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
592 }
593 
594 LegalizerHelper::LegalizeResult
595 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
596                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
597   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
598 
599   SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last, which is an imm denoting 'tail'.
601   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
602     Register Reg = MI.getOperand(i).getReg();
603 
    // We need to derive an IR type for call lowering.
605     LLT OpLLT = MRI.getType(Reg);
606     Type *OpTy = nullptr;
607     if (OpLLT.isPointer())
608       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
609     else
610       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
611     Args.push_back({Reg, OpTy, 0});
612   }
613 
614   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
615   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
616   RTLIB::Libcall RTLibcall;
617   unsigned Opc = MI.getOpcode();
618   switch (Opc) {
619   case TargetOpcode::G_BZERO:
620     RTLibcall = RTLIB::BZERO;
621     break;
622   case TargetOpcode::G_MEMCPY:
623     RTLibcall = RTLIB::MEMCPY;
624     Args[0].Flags[0].setReturned();
625     break;
626   case TargetOpcode::G_MEMMOVE:
627     RTLibcall = RTLIB::MEMMOVE;
628     Args[0].Flags[0].setReturned();
629     break;
630   case TargetOpcode::G_MEMSET:
631     RTLibcall = RTLIB::MEMSET;
632     Args[0].Flags[0].setReturned();
633     break;
634   default:
635     llvm_unreachable("unsupported opcode");
636   }
637   const char *Name = TLI.getLibcallName(RTLibcall);
638 
639   // Unsupported libcall on the target.
640   if (!Name) {
641     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
642                       << MIRBuilder.getTII().getName(Opc) << "\n");
643     return LegalizerHelper::UnableToLegalize;
644   }
645 
646   CallLowering::CallLoweringInfo Info;
647   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
648   Info.Callee = MachineOperand::CreateES(Name);
649   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
650   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
651                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
652 
653   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
654   if (!CLI.lowerCall(MIRBuilder, Info))
655     return LegalizerHelper::UnableToLegalize;
656 
657   if (Info.LoweredTailCall) {
658     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
659 
660     // Check debug locations before removing the return.
661     LocObserver.checkpoint(true);
662 
663     // We must have a return following the call (or debug insts) to get past
664     // isLibCallInTailPosition.
665     do {
666       MachineInstr *Next = MI.getNextNode();
667       assert(Next &&
668              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
669              "Expected instr following MI to be return or debug inst?");
670       // We lowered a tail call, so the call is now the return from the block.
671       // Delete the old return.
672       Next->eraseFromParent();
673     } while (MI.getNextNode());
674 
675     // We expect to lose the debug location from the return.
676     LocObserver.checkpoint(false);
677   }
678 
679   return LegalizerHelper::Legalized;
680 }
681 
682 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
683                                        Type *FromType) {
684   auto ToMVT = MVT::getVT(ToType);
685   auto FromMVT = MVT::getVT(FromType);
686 
687   switch (Opcode) {
688   case TargetOpcode::G_FPEXT:
689     return RTLIB::getFPEXT(FromMVT, ToMVT);
690   case TargetOpcode::G_FPTRUNC:
691     return RTLIB::getFPROUND(FromMVT, ToMVT);
692   case TargetOpcode::G_FPTOSI:
693     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
694   case TargetOpcode::G_FPTOUI:
695     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
696   case TargetOpcode::G_SITOFP:
697     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
698   case TargetOpcode::G_UITOFP:
699     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
700   }
701   llvm_unreachable("Unsupported libcall function");
702 }
703 
704 static LegalizerHelper::LegalizeResult
705 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
706                   Type *FromType) {
707   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
708   return createLibcall(MIRBuilder, Libcall,
709                        {MI.getOperand(0).getReg(), ToType, 0},
710                        {{MI.getOperand(1).getReg(), FromType, 0}});
711 }
712 
713 LegalizerHelper::LegalizeResult
714 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
715   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
716   unsigned Size = LLTy.getSizeInBits();
717   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
718 
719   switch (MI.getOpcode()) {
720   default:
721     return UnableToLegalize;
722   case TargetOpcode::G_SDIV:
723   case TargetOpcode::G_UDIV:
724   case TargetOpcode::G_SREM:
725   case TargetOpcode::G_UREM:
726   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
727     Type *HLTy = IntegerType::get(Ctx, Size);
728     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
729     if (Status != Legalized)
730       return Status;
731     break;
732   }
733   case TargetOpcode::G_FADD:
734   case TargetOpcode::G_FSUB:
735   case TargetOpcode::G_FMUL:
736   case TargetOpcode::G_FDIV:
737   case TargetOpcode::G_FMA:
738   case TargetOpcode::G_FPOW:
739   case TargetOpcode::G_FREM:
740   case TargetOpcode::G_FCOS:
741   case TargetOpcode::G_FSIN:
742   case TargetOpcode::G_FLOG10:
743   case TargetOpcode::G_FLOG:
744   case TargetOpcode::G_FLOG2:
745   case TargetOpcode::G_FEXP:
746   case TargetOpcode::G_FEXP2:
747   case TargetOpcode::G_FCEIL:
748   case TargetOpcode::G_FFLOOR:
749   case TargetOpcode::G_FMINNUM:
750   case TargetOpcode::G_FMAXNUM:
751   case TargetOpcode::G_FSQRT:
752   case TargetOpcode::G_FRINT:
753   case TargetOpcode::G_FNEARBYINT:
754   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
755     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
756     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
757       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
758       return UnableToLegalize;
759     }
760     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
761     if (Status != Legalized)
762       return Status;
763     break;
764   }
765   case TargetOpcode::G_FPEXT:
766   case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
768     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
769     if (!FromTy || !ToTy)
770       return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
772     if (Status != Legalized)
773       return Status;
774     break;
775   }
776   case TargetOpcode::G_FPTOSI:
777   case TargetOpcode::G_FPTOUI: {
778     // FIXME: Support other types
779     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
780     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
781     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
782       return UnableToLegalize;
783     LegalizeResult Status = conversionLibcall(
784         MI, MIRBuilder,
785         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
786         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
787     if (Status != Legalized)
788       return Status;
789     break;
790   }
791   case TargetOpcode::G_SITOFP:
792   case TargetOpcode::G_UITOFP: {
793     // FIXME: Support other types
794     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
795     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
796     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
797       return UnableToLegalize;
798     LegalizeResult Status = conversionLibcall(
799         MI, MIRBuilder,
800         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
801         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
802     if (Status != Legalized)
803       return Status;
804     break;
805   }
806   case TargetOpcode::G_BZERO:
807   case TargetOpcode::G_MEMCPY:
808   case TargetOpcode::G_MEMMOVE:
809   case TargetOpcode::G_MEMSET: {
810     LegalizeResult Result =
811         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
812     if (Result != Legalized)
813       return Result;
814     MI.eraseFromParent();
815     return Result;
816   }
817   }
818 
819   MI.eraseFromParent();
820   return Legalized;
821 }
822 
823 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
824                                                               unsigned TypeIdx,
825                                                               LLT NarrowTy) {
826   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
827   uint64_t NarrowSize = NarrowTy.getSizeInBits();
828 
829   switch (MI.getOpcode()) {
830   default:
831     return UnableToLegalize;
832   case TargetOpcode::G_IMPLICIT_DEF: {
833     Register DstReg = MI.getOperand(0).getReg();
834     LLT DstTy = MRI.getType(DstReg);
835 
836     // If SizeOp0 is not an exact multiple of NarrowSize, emit
837     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
838     // FIXME: Although this would also be legal for the general case, it causes
839     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
840     //  combines not being hit). This seems to be a problem related to the
841     //  artifact combiner.
842     if (SizeOp0 % NarrowSize != 0) {
843       LLT ImplicitTy = NarrowTy;
844       if (DstTy.isVector())
845         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
846 
847       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
848       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
849 
850       MI.eraseFromParent();
851       return Legalized;
852     }
853 
854     int NumParts = SizeOp0 / NarrowSize;
855 
856     SmallVector<Register, 2> DstRegs;
857     for (int i = 0; i < NumParts; ++i)
858       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
859 
860     if (DstTy.isVector())
861       MIRBuilder.buildBuildVector(DstReg, DstRegs);
862     else
863       MIRBuilder.buildMerge(DstReg, DstRegs);
864     MI.eraseFromParent();
865     return Legalized;
866   }
867   case TargetOpcode::G_CONSTANT: {
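    // Split the constant into shifted, truncated NarrowTy-sized pieces, e.g.
    // an s88 constant with NarrowTy = s32 becomes two s32 pieces plus an s24
    // leftover piece.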
868     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
869     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
870     unsigned TotalSize = Ty.getSizeInBits();
871     unsigned NarrowSize = NarrowTy.getSizeInBits();
872     int NumParts = TotalSize / NarrowSize;
873 
874     SmallVector<Register, 4> PartRegs;
875     for (int I = 0; I != NumParts; ++I) {
876       unsigned Offset = I * NarrowSize;
877       auto K = MIRBuilder.buildConstant(NarrowTy,
878                                         Val.lshr(Offset).trunc(NarrowSize));
879       PartRegs.push_back(K.getReg(0));
880     }
881 
882     LLT LeftoverTy;
883     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
884     SmallVector<Register, 1> LeftoverRegs;
885     if (LeftoverBits != 0) {
886       LeftoverTy = LLT::scalar(LeftoverBits);
887       auto K = MIRBuilder.buildConstant(
888         LeftoverTy,
889         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
890       LeftoverRegs.push_back(K.getReg(0));
891     }
892 
893     insertParts(MI.getOperand(0).getReg(),
894                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
895 
896     MI.eraseFromParent();
897     return Legalized;
898   }
899   case TargetOpcode::G_SEXT:
900   case TargetOpcode::G_ZEXT:
901   case TargetOpcode::G_ANYEXT:
902     return narrowScalarExt(MI, TypeIdx, NarrowTy);
903   case TargetOpcode::G_TRUNC: {
904     if (TypeIdx != 1)
905       return UnableToLegalize;
906 
907     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
908     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
909       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
910       return UnableToLegalize;
911     }
912 
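    // The source is exactly two NarrowTy-sized pieces, so e.g. narrowing
    // s32 = G_TRUNC s64 with NarrowTy = s32 unmerges the source into two s32
    // halves and copies the low half into the destination.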
913     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
914     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
915     MI.eraseFromParent();
916     return Legalized;
917   }
918 
919   case TargetOpcode::G_FREEZE:
920     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
921   case TargetOpcode::G_ADD:
922   case TargetOpcode::G_SUB:
923   case TargetOpcode::G_SADDO:
924   case TargetOpcode::G_SSUBO:
925   case TargetOpcode::G_SADDE:
926   case TargetOpcode::G_SSUBE:
927   case TargetOpcode::G_UADDO:
928   case TargetOpcode::G_USUBO:
929   case TargetOpcode::G_UADDE:
930   case TargetOpcode::G_USUBE:
931     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
932   case TargetOpcode::G_MUL:
933   case TargetOpcode::G_UMULH:
934     return narrowScalarMul(MI, NarrowTy);
935   case TargetOpcode::G_EXTRACT:
936     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
937   case TargetOpcode::G_INSERT:
938     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
939   case TargetOpcode::G_LOAD: {
940     auto &LoadMI = cast<GLoad>(MI);
941     Register DstReg = LoadMI.getDstReg();
942     LLT DstTy = MRI.getType(DstReg);
943     if (DstTy.isVector())
944       return UnableToLegalize;
945 
946     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
947       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
948       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
949       MIRBuilder.buildAnyExt(DstReg, TmpReg);
950       LoadMI.eraseFromParent();
951       return Legalized;
952     }
953 
954     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
955   }
956   case TargetOpcode::G_ZEXTLOAD:
957   case TargetOpcode::G_SEXTLOAD: {
958     auto &LoadMI = cast<GExtLoad>(MI);
959     Register DstReg = LoadMI.getDstReg();
960     Register PtrReg = LoadMI.getPointerReg();
961 
962     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
963     auto &MMO = LoadMI.getMMO();
964     unsigned MemSize = MMO.getSizeInBits();
965 
966     if (MemSize == NarrowSize) {
967       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
968     } else if (MemSize < NarrowSize) {
969       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
970     } else if (MemSize > NarrowSize) {
971       // FIXME: Need to split the load.
972       return UnableToLegalize;
973     }
974 
975     if (isa<GZExtLoad>(LoadMI))
976       MIRBuilder.buildZExt(DstReg, TmpReg);
977     else
978       MIRBuilder.buildSExt(DstReg, TmpReg);
979 
980     LoadMI.eraseFromParent();
981     return Legalized;
982   }
983   case TargetOpcode::G_STORE: {
984     auto &StoreMI = cast<GStore>(MI);
985 
986     Register SrcReg = StoreMI.getValueReg();
987     LLT SrcTy = MRI.getType(SrcReg);
988     if (SrcTy.isVector())
989       return UnableToLegalize;
990 
991     int NumParts = SizeOp0 / NarrowSize;
992     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
993     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
994     if (SrcTy.isVector() && LeftoverBits != 0)
995       return UnableToLegalize;
996 
997     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
998       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
999       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1000       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1001       StoreMI.eraseFromParent();
1002       return Legalized;
1003     }
1004 
1005     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1006   }
1007   case TargetOpcode::G_SELECT:
1008     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1009   case TargetOpcode::G_AND:
1010   case TargetOpcode::G_OR:
1011   case TargetOpcode::G_XOR: {
1012     // Legalize bitwise operation:
1013     // A = BinOp<Ty> B, C
1014     // into:
1015     // B1, ..., BN = G_UNMERGE_VALUES B
1016     // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
1018     // ...
1019     // AN = BinOp<Ty/N> BN, CN
1020     // A = G_MERGE_VALUES A1, ..., AN
1021     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1022   }
1023   case TargetOpcode::G_SHL:
1024   case TargetOpcode::G_LSHR:
1025   case TargetOpcode::G_ASHR:
1026     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1027   case TargetOpcode::G_CTLZ:
1028   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1029   case TargetOpcode::G_CTTZ:
1030   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1031   case TargetOpcode::G_CTPOP:
1032     if (TypeIdx == 1)
1033       switch (MI.getOpcode()) {
1034       case TargetOpcode::G_CTLZ:
1035       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1036         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1037       case TargetOpcode::G_CTTZ:
1038       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1039         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1040       case TargetOpcode::G_CTPOP:
1041         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1042       default:
1043         return UnableToLegalize;
1044       }
1045 
1046     Observer.changingInstr(MI);
1047     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1048     Observer.changedInstr(MI);
1049     return Legalized;
1050   case TargetOpcode::G_INTTOPTR:
1051     if (TypeIdx != 1)
1052       return UnableToLegalize;
1053 
1054     Observer.changingInstr(MI);
1055     narrowScalarSrc(MI, NarrowTy, 1);
1056     Observer.changedInstr(MI);
1057     return Legalized;
1058   case TargetOpcode::G_PTRTOINT:
1059     if (TypeIdx != 0)
1060       return UnableToLegalize;
1061 
1062     Observer.changingInstr(MI);
1063     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1064     Observer.changedInstr(MI);
1065     return Legalized;
1066   case TargetOpcode::G_PHI: {
1067     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1068     // NarrowSize.
1069     if (SizeOp0 % NarrowSize != 0)
1070       return UnableToLegalize;
1071 
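    // e.g. an s64 G_PHI with NarrowTy = s32 is rewritten as two s32 G_PHIs
    // whose results are remerged into the original destination.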
1072     unsigned NumParts = SizeOp0 / NarrowSize;
1073     SmallVector<Register, 2> DstRegs(NumParts);
1074     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1075     Observer.changingInstr(MI);
1076     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1077       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1078       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
1079       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1080                    SrcRegs[i / 2]);
1081     }
1082     MachineBasicBlock &MBB = *MI.getParent();
1083     MIRBuilder.setInsertPt(MBB, MI);
1084     for (unsigned i = 0; i < NumParts; ++i) {
1085       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1086       MachineInstrBuilder MIB =
1087           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1088       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1089         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1090     }
1091     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1092     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1093     Observer.changedInstr(MI);
1094     MI.eraseFromParent();
1095     return Legalized;
1096   }
1097   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1098   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1099     if (TypeIdx != 2)
1100       return UnableToLegalize;
1101 
1102     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1103     Observer.changingInstr(MI);
1104     narrowScalarSrc(MI, NarrowTy, OpIdx);
1105     Observer.changedInstr(MI);
1106     return Legalized;
1107   }
1108   case TargetOpcode::G_ICMP: {
1109     Register LHS = MI.getOperand(2).getReg();
1110     LLT SrcTy = MRI.getType(LHS);
1111     uint64_t SrcSize = SrcTy.getSizeInBits();
1112     CmpInst::Predicate Pred =
1113         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1114 
1115     // TODO: Handle the non-equality case for weird sizes.
1116     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1117       return UnableToLegalize;
1118 
1119     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1120     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1121     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1122                       LHSLeftoverRegs))
1123       return UnableToLegalize;
1124 
1125     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1126     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1127     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1128                       RHSPartRegs, RHSLeftoverRegs))
1129       return UnableToLegalize;
1130 
1131     // We now have the LHS and RHS of the compare split into narrow-type
1132     // registers, plus potentially some leftover type.
1133     Register Dst = MI.getOperand(0).getReg();
1134     LLT ResTy = MRI.getType(Dst);
1135     if (ICmpInst::isEquality(Pred)) {
1136       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1137       // them together. For each equal part, the result should be all 0s. For
1138       // each non-equal part, we'll get at least one 1.
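      // For example, for an equality compare of s128 values split into two
      // s64 parts (register names here are illustrative):
      //   %xl:_(s64) = G_XOR %lhs_lo, %rhs_lo
      //   %xh:_(s64) = G_XOR %lhs_hi, %rhs_hi
      //   %or:_(s64) = G_OR %xl, %xh
      //   %dst:_(s1) = G_ICMP intpred(eq), %or, 0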
1139       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1140       SmallVector<Register, 4> Xors;
1141       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1142         auto LHS = std::get<0>(LHSAndRHS);
1143         auto RHS = std::get<1>(LHSAndRHS);
1144         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1145         Xors.push_back(Xor);
1146       }
1147 
      // Build a G_XOR for each leftover register. Each leftover-typed G_XOR
      // is zero-padded out to the narrow type so that we can OR all of the
      // pieces together later.
1150       SmallVector<Register, 4> WidenedXors;
1151       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1152         auto LHS = std::get<0>(LHSAndRHS);
1153         auto RHS = std::get<1>(LHSAndRHS);
1154         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1155         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1156         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1157                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1158         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1159       }
1160 
1161       // Now, for each part we broke up, we know if they are equal/not equal
1162       // based off the G_XOR. We can OR these all together and compare against
1163       // 0 to get the result.
1164       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1165       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1166       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1167         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1168       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1169     } else {
1170       // TODO: Handle non-power-of-two types.
1171       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1172       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1173       Register LHSL = LHSPartRegs[0];
1174       Register LHSH = LHSPartRegs[1];
1175       Register RHSL = RHSPartRegs[0];
1176       Register RHSH = RHSPartRegs[1];
1177       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1178       MachineInstrBuilder CmpHEQ =
1179           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1180       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1181           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1182       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1183     }
1184     MI.eraseFromParent();
1185     return Legalized;
1186   }
1187   case TargetOpcode::G_SEXT_INREG: {
1188     if (TypeIdx != 0)
1189       return UnableToLegalize;
1190 
1191     int64_t SizeInBits = MI.getOperand(2).getImm();
1192 
    // So long as the new type has more bits than the bits we're extending, we
    // don't need to break it apart.
1195     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1196       Observer.changingInstr(MI);
1197       // We don't lose any non-extension bits by truncating the src and
1198       // sign-extending the dst.
1199       MachineOperand &MO1 = MI.getOperand(1);
1200       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1201       MO1.setReg(TruncMIB.getReg(0));
1202 
1203       MachineOperand &MO2 = MI.getOperand(0);
1204       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1205       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1206       MIRBuilder.buildSExt(MO2, DstExt);
1207       MO2.setReg(DstExt);
1208       Observer.changedInstr(MI);
1209       return Legalized;
1210     }
1211 
1212     // Break it apart. Components below the extension point are unmodified. The
1213     // component containing the extension point becomes a narrower SEXT_INREG.
1214     // Components above it are ashr'd from the component containing the
1215     // extension point.
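    // For example, narrowing s64 = G_SEXT_INREG %x, 40 with NarrowTy = s32:
    // the low s32 part passes through unmodified, the part containing bit 39
    // becomes a G_SEXT_INREG with 8 bits, and any parts above it would be
    // G_ASHRs of that partial extension.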
1216     if (SizeOp0 % NarrowSize != 0)
1217       return UnableToLegalize;
1218     int NumParts = SizeOp0 / NarrowSize;
1219 
1220     // List the registers where the destination will be scattered.
1221     SmallVector<Register, 2> DstRegs;
1222     // List the registers where the source will be split.
1223     SmallVector<Register, 2> SrcRegs;
1224 
1225     // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      SrcRegs.push_back(SrcReg);
    }
1231 
1232     // Explode the big arguments into smaller chunks.
1233     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1234 
1235     Register AshrCstReg =
1236         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1237             .getReg(0);
1238     Register FullExtensionReg = 0;
1239     Register PartialExtensionReg = 0;
1240 
1241     // Do the operation on each small part.
1242     for (int i = 0; i < NumParts; ++i) {
1243       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1244         DstRegs.push_back(SrcRegs[i]);
1245       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1246         assert(PartialExtensionReg &&
1247                "Expected to visit partial extension before full");
1248         if (FullExtensionReg) {
1249           DstRegs.push_back(FullExtensionReg);
1250           continue;
1251         }
1252         DstRegs.push_back(
1253             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1254                 .getReg(0));
1255         FullExtensionReg = DstRegs.back();
1256       } else {
1257         DstRegs.push_back(
1258             MIRBuilder
1259                 .buildInstr(
1260                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1261                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1262                 .getReg(0));
1263         PartialExtensionReg = DstRegs.back();
1264       }
1265     }
1266 
1267     // Gather the destination registers into the final destination.
1268     Register DstReg = MI.getOperand(0).getReg();
1269     MIRBuilder.buildMerge(DstReg, DstRegs);
1270     MI.eraseFromParent();
1271     return Legalized;
1272   }
1273   case TargetOpcode::G_BSWAP:
1274   case TargetOpcode::G_BITREVERSE: {
1275     if (SizeOp0 % NarrowSize != 0)
1276       return UnableToLegalize;
1277 
1278     Observer.changingInstr(MI);
1279     SmallVector<Register, 2> SrcRegs, DstRegs;
1280     unsigned NumParts = SizeOp0 / NarrowSize;
1281     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1282 
1283     for (unsigned i = 0; i < NumParts; ++i) {
1284       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1285                                            {SrcRegs[NumParts - 1 - i]});
1286       DstRegs.push_back(DstPart.getReg(0));
1287     }
1288 
1289     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1290 
1291     Observer.changedInstr(MI);
1292     MI.eraseFromParent();
1293     return Legalized;
1294   }
1295   case TargetOpcode::G_PTR_ADD:
1296   case TargetOpcode::G_PTRMASK: {
1297     if (TypeIdx != 1)
1298       return UnableToLegalize;
1299     Observer.changingInstr(MI);
1300     narrowScalarSrc(MI, NarrowTy, 2);
1301     Observer.changedInstr(MI);
1302     return Legalized;
1303   }
1304   case TargetOpcode::G_FPTOUI:
1305   case TargetOpcode::G_FPTOSI:
1306     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1307   case TargetOpcode::G_FPEXT:
1308     if (TypeIdx != 0)
1309       return UnableToLegalize;
1310     Observer.changingInstr(MI);
1311     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1312     Observer.changedInstr(MI);
1313     return Legalized;
1314   }
1315 }
1316 
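/// Cast \p Val to a scalar of the same total size, inserting G_PTRTOINT or
/// G_BITCAST instructions as needed. Returns an invalid register for pointers
/// in non-integral address spaces, which cannot be losslessly converted.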
1317 Register LegalizerHelper::coerceToScalar(Register Val) {
1318   LLT Ty = MRI.getType(Val);
1319   if (Ty.isScalar())
1320     return Val;
1321 
1322   const DataLayout &DL = MIRBuilder.getDataLayout();
1323   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1324   if (Ty.isPointer()) {
1325     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1326       return Register();
1327     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1328   }
1329 
1330   Register NewVal = Val;
1331 
1332   assert(Ty.isVector());
1333   LLT EltTy = Ty.getElementType();
1334   if (EltTy.isPointer())
1335     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1336   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1337 }
1338 
1339 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1340                                      unsigned OpIdx, unsigned ExtOpcode) {
1341   MachineOperand &MO = MI.getOperand(OpIdx);
1342   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1343   MO.setReg(ExtB.getReg(0));
1344 }
1345 
1346 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1347                                       unsigned OpIdx) {
1348   MachineOperand &MO = MI.getOperand(OpIdx);
1349   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1350   MO.setReg(ExtB.getReg(0));
1351 }
1352 
1353 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1354                                      unsigned OpIdx, unsigned TruncOpcode) {
1355   MachineOperand &MO = MI.getOperand(OpIdx);
1356   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1357   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1358   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1359   MO.setReg(DstExt);
1360 }
1361 
1362 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1363                                       unsigned OpIdx, unsigned ExtOpcode) {
1364   MachineOperand &MO = MI.getOperand(OpIdx);
1365   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1366   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1367   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1368   MO.setReg(DstTrunc);
1369 }
1370 
1371 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1372                                             unsigned OpIdx) {
1373   MachineOperand &MO = MI.getOperand(OpIdx);
1374   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1375   MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
1376 }
1377 
1378 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1379                                             unsigned OpIdx) {
1380   MachineOperand &MO = MI.getOperand(OpIdx);
1381 
1382   LLT OldTy = MRI.getType(MO.getReg());
1383   unsigned OldElts = OldTy.getNumElements();
1384   unsigned NewElts = MoreTy.getNumElements();
1385 
1386   unsigned NumParts = NewElts / OldElts;
1387 
1388   // Use concat_vectors if the result is a multiple of the number of elements.
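  // e.g. widening <2 x s32> to <6 x s32> concatenates the source with two
  // undef <2 x s32> vectors.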
1389   if (NumParts * OldElts == NewElts) {
1390     SmallVector<Register, 8> Parts;
1391     Parts.push_back(MO.getReg());
1392 
1393     Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
1394     for (unsigned I = 1; I != NumParts; ++I)
1395       Parts.push_back(ImpDef);
1396 
1397     auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
1398     MO.setReg(Concat.getReg(0));
1399     return;
1400   }
1401 
1402   Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
1403   Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
1404   MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
1405   MO.setReg(MoreReg);
1406 }
1407 
1408 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1409   MachineOperand &Op = MI.getOperand(OpIdx);
1410   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1411 }
1412 
1413 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1414   MachineOperand &MO = MI.getOperand(OpIdx);
1415   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1416   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1417   MIRBuilder.buildBitcast(MO, CastDst);
1418   MO.setReg(CastDst);
1419 }
1420 
1421 LegalizerHelper::LegalizeResult
1422 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1423                                         LLT WideTy) {
1424   if (TypeIdx != 1)
1425     return UnableToLegalize;
1426 
1427   Register DstReg = MI.getOperand(0).getReg();
1428   LLT DstTy = MRI.getType(DstReg);
1429   if (DstTy.isVector())
1430     return UnableToLegalize;
1431 
1432   Register Src1 = MI.getOperand(1).getReg();
1433   LLT SrcTy = MRI.getType(Src1);
1434   const int DstSize = DstTy.getSizeInBits();
1435   const int SrcSize = SrcTy.getSizeInBits();
1436   const int WideSize = WideTy.getSizeInBits();
1437   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1438 
1439   unsigned NumOps = MI.getNumOperands();
1440   unsigned NumSrc = MI.getNumOperands() - 1;
1441   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1442 
1443   if (WideSize >= DstSize) {
1444     // Directly pack the bits in the target type.
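    // e.g. widening %d:_(s16) = G_MERGE_VALUES %a:_(s8), %b:_(s8) to s32:
    //   %0:_(s32) = G_ZEXT %a:_(s8)
    //   %1:_(s32) = G_ZEXT %b:_(s8)
    //   %2:_(s32) = G_CONSTANT i32 8
    //   %3:_(s32) = G_SHL %1, %2
    //   %4:_(s32) = G_OR %0, %3
    //   %d:_(s16) = G_TRUNC %4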
1445     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1446 
1447     for (unsigned I = 2; I != NumOps; ++I) {
1448       const unsigned Offset = (I - 1) * PartSize;
1449 
1450       Register SrcReg = MI.getOperand(I).getReg();
1451       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1452 
1453       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1454 
1455       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1456         MRI.createGenericVirtualRegister(WideTy);
1457 
1458       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1459       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1460       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1461       ResultReg = NextResult;
1462     }
1463 
1464     if (WideSize > DstSize)
1465       MIRBuilder.buildTrunc(DstReg, ResultReg);
1466     else if (DstTy.isPointer())
1467       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1468 
1469     MI.eraseFromParent();
1470     return Legalized;
1471   }
1472 
1473   // Unmerge the original values to the GCD type, and recombine to the next
1474   // multiple greater than the original type.
1475   //
1476   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1477   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1478   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1479   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1480   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1481   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1482   // %12:_(s12) = G_MERGE_VALUES %10, %11
1483   //
1484   // Padding with undef if necessary:
1485   //
1486   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1487   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1488   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1489   // %7:_(s2) = G_IMPLICIT_DEF
1490   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1491   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1492   // %10:_(s12) = G_MERGE_VALUES %8, %9
1493 
1494   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1495   LLT GCDTy = LLT::scalar(GCD);
1496 
1497   SmallVector<Register, 8> Parts;
1498   SmallVector<Register, 8> NewMergeRegs;
1499   SmallVector<Register, 8> Unmerges;
1500   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1501 
  // Decompose the original operands to GCD-sized pieces unless they are
  // already the GCD size.
1503   for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
1504     Register SrcReg = MI.getOperand(I).getReg();
1505     if (GCD == SrcSize) {
1506       Unmerges.push_back(SrcReg);
1507     } else {
1508       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1509       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1510         Unmerges.push_back(Unmerge.getReg(J));
1511     }
1512   }
1513 
1514   // Pad with undef to the next size that is a multiple of the requested size.
1515   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1516     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1517     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1518       Unmerges.push_back(UndefReg);
1519   }
1520 
1521   const int PartsPerGCD = WideSize / GCD;
1522 
1523   // Build merges of each piece.
1524   ArrayRef<Register> Slicer(Unmerges);
1525   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1526     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1527     NewMergeRegs.push_back(Merge.getReg(0));
1528   }
1529 
1530   // A truncate may be necessary if the requested type doesn't evenly divide the
1531   // original result type.
1532   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1533     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1534   } else {
1535     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1536     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1537   }
1538 
1539   MI.eraseFromParent();
1540   return Legalized;
1541 }
1542 
1543 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1544   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1545   LLT OrigTy = MRI.getType(OrigReg);
1546   LLT LCMTy = getLCMType(WideTy, OrigTy);
1547 
1548   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1549   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1550 
1551   Register UnmergeSrc = WideReg;
1552 
1553   // Create a merge to the LCM type, padding with undef
1554   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1555   // =>
1556   // %1:_(<4 x s32>) = G_FOO
1557   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1558   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1559   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1560   if (NumMergeParts > 1) {
1561     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1562     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1563     MergeParts[0] = WideReg;
1564     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1565   }
1566 
1567   // Unmerge to the original register and pad with dead defs.
1568   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1569   UnmergeResults[0] = OrigReg;
1570   for (int I = 1; I != NumUnmergeParts; ++I)
1571     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1572 
1573   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1574   return WideReg;
1575 }
1576 
1577 LegalizerHelper::LegalizeResult
1578 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1579                                           LLT WideTy) {
1580   if (TypeIdx != 0)
1581     return UnableToLegalize;
1582 
1583   int NumDst = MI.getNumOperands() - 1;
1584   Register SrcReg = MI.getOperand(NumDst).getReg();
1585   LLT SrcTy = MRI.getType(SrcReg);
1586   if (SrcTy.isVector())
1587     return UnableToLegalize;
1588 
1589   Register Dst0Reg = MI.getOperand(0).getReg();
1590   LLT DstTy = MRI.getType(Dst0Reg);
1591   if (!DstTy.isScalar())
1592     return UnableToLegalize;
1593 
1594   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1595     if (SrcTy.isPointer()) {
1596       const DataLayout &DL = MIRBuilder.getDataLayout();
1597       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1598         LLVM_DEBUG(
1599             dbgs() << "Not casting non-integral address space integer\n");
1600         return UnableToLegalize;
1601       }
1602 
1603       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1604       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1605     }
1606 
    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
1610     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1611       SrcTy = WideTy;
1612       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1613     }
1614 
    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
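    // e.g. widening %1:_(s8), %2:_(s8) = G_UNMERGE_VALUES %0:_(s16) to s32:
    //   %3:_(s32) = G_ANYEXT %0:_(s16)
    //   %1:_(s8) = G_TRUNC %3
    //   %4:_(s32) = G_CONSTANT i32 8
    //   %5:_(s32) = G_LSHR %3, %4
    //   %2:_(s8) = G_TRUNC %5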
1617     unsigned DstSize = DstTy.getSizeInBits();
1618 
1619     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1620     for (int I = 1; I != NumDst; ++I) {
1621       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1622       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1623       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1624     }
1625 
1626     MI.eraseFromParent();
1627     return Legalized;
1628   }
1629 
1630   // Extend the source to a wider type.
1631   LLT LCMTy = getLCMType(SrcTy, WideTy);
1632 
1633   Register WideSrc = SrcReg;
1634   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1635     // TODO: If this is an integral address space, cast to integer and anyext.
1636     if (SrcTy.isPointer()) {
1637       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1638       return UnableToLegalize;
1639     }
1640 
1641     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1642   }
1643 
1644   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1645 
1646   // Create a sequence of unmerges and merges to the original results. Since we
1647   // may have widened the source, we will need to pad the results with dead defs
1648   // to cover the source register.
1649   // e.g. widen s48 to s64:
1650   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1651   //
1652   // =>
1653   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1654   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1655   //  ; unpack to GCD type, with extra dead defs
1656   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1657   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
1659   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1660   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1661   const LLT GCDTy = getGCDType(WideTy, DstTy);
1662   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1663   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1664 
  // Directly unmerge to the destination without going through a GCD type
  // if possible.
1667   if (PartsPerRemerge == 1) {
1668     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1669 
1670     for (int I = 0; I != NumUnmerge; ++I) {
1671       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1672 
1673       for (int J = 0; J != PartsPerUnmerge; ++J) {
1674         int Idx = I * PartsPerUnmerge + J;
1675         if (Idx < NumDst)
1676           MIB.addDef(MI.getOperand(Idx).getReg());
1677         else {
1678           // Create dead def for excess components.
1679           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1680         }
1681       }
1682 
1683       MIB.addUse(Unmerge.getReg(I));
1684     }
1685   } else {
1686     SmallVector<Register, 16> Parts;
1687     for (int J = 0; J != NumUnmerge; ++J)
1688       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1689 
1690     SmallVector<Register, 8> RemergeParts;
1691     for (int I = 0; I != NumDst; ++I) {
1692       for (int J = 0; J < PartsPerRemerge; ++J) {
1693         const int Idx = I * PartsPerRemerge + J;
1694         RemergeParts.emplace_back(Parts[Idx]);
1695       }
1696 
1697       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1698       RemergeParts.clear();
1699     }
1700   }
1701 
1702   MI.eraseFromParent();
1703   return Legalized;
1704 }
1705 
1706 LegalizerHelper::LegalizeResult
1707 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1708                                     LLT WideTy) {
1709   Register DstReg = MI.getOperand(0).getReg();
1710   Register SrcReg = MI.getOperand(1).getReg();
1711   LLT SrcTy = MRI.getType(SrcReg);
1712 
1713   LLT DstTy = MRI.getType(DstReg);
1714   unsigned Offset = MI.getOperand(2).getImm();
1715 
1716   if (TypeIdx == 0) {
1717     if (SrcTy.isVector() || DstTy.isVector())
1718       return UnableToLegalize;
1719 
1720     SrcOp Src(SrcReg);
1721     if (SrcTy.isPointer()) {
1722       // Extracts from pointers can be handled only if they are really just
1723       // simple integers.
1724       const DataLayout &DL = MIRBuilder.getDataLayout();
1725       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1726         return UnableToLegalize;
1727 
1728       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1729       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1730       SrcTy = SrcAsIntTy;
1731     }
1732 
1733     if (DstTy.isPointer())
1734       return UnableToLegalize;
1735 
1736     if (Offset == 0) {
1737       // Avoid a shift in the degenerate case.
1738       MIRBuilder.buildTrunc(DstReg,
1739                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1740       MI.eraseFromParent();
1741       return Legalized;
1742     }
1743 
1744     // Do a shift in the source type.
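    // e.g. for %1:_(s8) = G_EXTRACT %0:_(s32), 8 with the result widened to
    // s16, the shift stays in the wider s32 source type:
    //   %2:_(s32) = G_CONSTANT i32 8
    //   %3:_(s32) = G_LSHR %0:_(s32), %2
    //   %1:_(s8) = G_TRUNC %3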
1745     LLT ShiftTy = SrcTy;
1746     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1747       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1748       ShiftTy = WideTy;
1749     }
1750 
1751     auto LShr = MIRBuilder.buildLShr(
1752       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1753     MIRBuilder.buildTrunc(DstReg, LShr);
1754     MI.eraseFromParent();
1755     return Legalized;
1756   }
1757 
1758   if (SrcTy.isScalar()) {
1759     Observer.changingInstr(MI);
1760     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1761     Observer.changedInstr(MI);
1762     return Legalized;
1763   }
1764 
1765   if (!SrcTy.isVector())
1766     return UnableToLegalize;
1767 
1768   if (DstTy != SrcTy.getElementType())
1769     return UnableToLegalize;
1770 
1771   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1772     return UnableToLegalize;
1773 
1774   Observer.changingInstr(MI);
1775   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1776 
1777   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1778                           Offset);
1779   widenScalarDst(MI, WideTy.getScalarType(), 0);
1780   Observer.changedInstr(MI);
1781   return Legalized;
1782 }
1783 
1784 LegalizerHelper::LegalizeResult
1785 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1786                                    LLT WideTy) {
1787   if (TypeIdx != 0 || WideTy.isVector())
1788     return UnableToLegalize;
1789   Observer.changingInstr(MI);
1790   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1791   widenScalarDst(MI, WideTy);
1792   Observer.changedInstr(MI);
1793   return Legalized;
1794 }
1795 
1796 LegalizerHelper::LegalizeResult
1797 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1798                                            LLT WideTy) {
1799   if (TypeIdx == 1)
1800     return UnableToLegalize; // TODO
1801 
1802   unsigned Opcode;
1803   unsigned ExtOpcode;
1804   Optional<Register> CarryIn = None;
1805   switch (MI.getOpcode()) {
1806   default:
1807     llvm_unreachable("Unexpected opcode!");
1808   case TargetOpcode::G_SADDO:
1809     Opcode = TargetOpcode::G_ADD;
1810     ExtOpcode = TargetOpcode::G_SEXT;
1811     break;
1812   case TargetOpcode::G_SSUBO:
1813     Opcode = TargetOpcode::G_SUB;
1814     ExtOpcode = TargetOpcode::G_SEXT;
1815     break;
1816   case TargetOpcode::G_UADDO:
1817     Opcode = TargetOpcode::G_ADD;
1818     ExtOpcode = TargetOpcode::G_ZEXT;
1819     break;
1820   case TargetOpcode::G_USUBO:
1821     Opcode = TargetOpcode::G_SUB;
1822     ExtOpcode = TargetOpcode::G_ZEXT;
1823     break;
1824   case TargetOpcode::G_SADDE:
1825     Opcode = TargetOpcode::G_UADDE;
1826     ExtOpcode = TargetOpcode::G_SEXT;
1827     CarryIn = MI.getOperand(4).getReg();
1828     break;
1829   case TargetOpcode::G_SSUBE:
1830     Opcode = TargetOpcode::G_USUBE;
1831     ExtOpcode = TargetOpcode::G_SEXT;
1832     CarryIn = MI.getOperand(4).getReg();
1833     break;
1834   case TargetOpcode::G_UADDE:
1835     Opcode = TargetOpcode::G_UADDE;
1836     ExtOpcode = TargetOpcode::G_ZEXT;
1837     CarryIn = MI.getOperand(4).getReg();
1838     break;
1839   case TargetOpcode::G_USUBE:
1840     Opcode = TargetOpcode::G_USUBE;
1841     ExtOpcode = TargetOpcode::G_ZEXT;
1842     CarryIn = MI.getOperand(4).getReg();
1843     break;
1844   }
1845 
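  // e.g. widening %res:_(s8), %ov:_(s1) = G_UADDO %a:_(s8), %b:_(s8) to s16:
  //   %0:_(s16) = G_ZEXT %a:_(s8)
  //   %1:_(s16) = G_ZEXT %b:_(s8)
  //   %2:_(s16) = G_ADD %0, %1
  //   %3:_(s8) = G_TRUNC %2
  //   %4:_(s16) = G_ZEXT %3
  //   %ov:_(s1) = G_ICMP intpred(ne), %2, %4
  //   %res:_(s8) = G_TRUNC %2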
1846   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1847   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1848   // Do the arithmetic in the larger type.
1849   Register NewOp;
1850   if (CarryIn) {
1851     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1852     NewOp = MIRBuilder
1853                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1854                             {LHSExt, RHSExt, *CarryIn})
1855                 .getReg(0);
1856   } else {
1857     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1858   }
1859   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1860   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1861   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1862   // There is no overflow if the ExtOp is the same as NewOp.
1863   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1864   // Now trunc the NewOp to the original result.
1865   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1866   MI.eraseFromParent();
1867   return Legalized;
1868 }
1869 
1870 LegalizerHelper::LegalizeResult
1871 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1872                                          LLT WideTy) {
1873   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1874                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1875                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1876   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1877                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1878   // We can convert this to:
1879   //   1. Any extend iN to iM
1880   //   2. SHL by M-N
1881   //   3. [US][ADD|SUB|SHL]SAT
1882   //   4. L/ASHR by M-N
1883   //
1884   // It may be more efficient to lower this to a min and a max operation in
1885   // the higher precision arithmetic if the promoted operation isn't legal,
1886   // but this decision is up to the target's lowering request.
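  // e.g. widening %d:_(s8) = G_SADDSAT %a:_(s8), %b:_(s8) to s16:
  //   %0:_(s16) = G_ANYEXT %a:_(s8)
  //   %1:_(s16) = G_ANYEXT %b:_(s8)
  //   %2:_(s16) = G_CONSTANT i16 8
  //   %3:_(s16) = G_SHL %0, %2
  //   %4:_(s16) = G_SHL %1, %2
  //   %5:_(s16) = G_SADDSAT %3, %4
  //   %6:_(s16) = G_ASHR %5, %2
  //   %d:_(s8) = G_TRUNC %6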
1887   Register DstReg = MI.getOperand(0).getReg();
1888 
1889   unsigned NewBits = WideTy.getScalarSizeInBits();
1890   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1891 
  // For the shift operations the RHS is a shift amount: it must be
  // zero-extended to preserve its unsigned value, and must not itself be
  // left-shifted.
1894   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1895   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1896                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1897   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1898   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1899   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1900 
1901   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1902                                         {ShiftL, ShiftR}, MI.getFlags());
1903 
1904   // Use a shift that will preserve the number of sign bits when the trunc is
1905   // folded away.
1906   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1907                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1908 
1909   MIRBuilder.buildTrunc(DstReg, Result);
1910   MI.eraseFromParent();
1911   return Legalized;
1912 }
1913 
1914 LegalizerHelper::LegalizeResult
1915 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1916                                  LLT WideTy) {
1917   if (TypeIdx == 1)
1918     return UnableToLegalize;
1919 
1920   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1921   Register Result = MI.getOperand(0).getReg();
1922   Register OriginalOverflow = MI.getOperand(1).getReg();
1923   Register LHS = MI.getOperand(2).getReg();
1924   Register RHS = MI.getOperand(3).getReg();
1925   LLT SrcTy = MRI.getType(LHS);
1926   LLT OverflowTy = MRI.getType(OriginalOverflow);
1927   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1928 
1929   // To determine if the result overflowed in the larger type, we extend the
1930   // input to the larger type, do the multiply (checking if it overflows),
1931   // then also check the high bits of the result to see if overflow happened
1932   // there.
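  // e.g. widening %r:_(s8), %o:_(s1) = G_UMULO %a:_(s8), %b:_(s8) to s16,
  // where the high byte of the wide product alone determines overflow:
  //   %0:_(s16) = G_ZEXT %a:_(s8)
  //   %1:_(s16) = G_ZEXT %b:_(s8)
  //   %2:_(s16), %3:_(s1) = G_UMULO %0, %1
  //   %r:_(s8) = G_TRUNC %2
  //   %4:_(s16) = G_CONSTANT i16 255
  //   %5:_(s16) = G_AND %2, %4
  //   %o:_(s1) = G_ICMP intpred(ne), %2, %5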
1933   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1934   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1935   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1936 
1937   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1938                                     {LeftOperand, RightOperand});
1939   auto Mul = Mulo->getOperand(0);
1940   MIRBuilder.buildTrunc(Result, Mul);
1941 
1942   MachineInstrBuilder ExtResult;
1943   // Overflow occurred if it occurred in the larger type, or if the high part
1944   // of the result does not zero/sign-extend the low part.  Check this second
1945   // possibility first.
1946   if (IsSigned) {
1947     // For signed, overflow occurred when the high part does not sign-extend
1948     // the low part.
1949     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
1950   } else {
1951     // Unsigned overflow occurred when the high part does not zero-extend the
1952     // low part.
1953     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
1954   }
1955 
  // Multiplication cannot overflow if WideTy is at least twice the original
  // width, so in that case we don't need to check the overflow result of the
  // wider-type Mulo.
1958   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
1959     auto Overflow =
1960         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
1961     // Finally check if the multiplication in the larger type itself overflowed.
1962     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
1963   } else {
1964     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
1965   }
1966   MI.eraseFromParent();
1967   return Legalized;
1968 }
1969 
1970 LegalizerHelper::LegalizeResult
1971 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1972   switch (MI.getOpcode()) {
1973   default:
1974     return UnableToLegalize;
1975   case TargetOpcode::G_ATOMICRMW_XCHG:
1976   case TargetOpcode::G_ATOMICRMW_ADD:
1977   case TargetOpcode::G_ATOMICRMW_SUB:
1978   case TargetOpcode::G_ATOMICRMW_AND:
1979   case TargetOpcode::G_ATOMICRMW_OR:
1980   case TargetOpcode::G_ATOMICRMW_XOR:
1981   case TargetOpcode::G_ATOMICRMW_MIN:
1982   case TargetOpcode::G_ATOMICRMW_MAX:
1983   case TargetOpcode::G_ATOMICRMW_UMIN:
1984   case TargetOpcode::G_ATOMICRMW_UMAX:
1985     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
1986     Observer.changingInstr(MI);
1987     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1988     widenScalarDst(MI, WideTy, 0);
1989     Observer.changedInstr(MI);
1990     return Legalized;
1991   case TargetOpcode::G_ATOMIC_CMPXCHG:
1992     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
1993     Observer.changingInstr(MI);
1994     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1995     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
1996     widenScalarDst(MI, WideTy, 0);
1997     Observer.changedInstr(MI);
1998     return Legalized;
1999   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2000     if (TypeIdx == 0) {
2001       Observer.changingInstr(MI);
2002       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2003       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2004       widenScalarDst(MI, WideTy, 0);
2005       Observer.changedInstr(MI);
2006       return Legalized;
2007     }
2008     assert(TypeIdx == 1 &&
2009            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2010     Observer.changingInstr(MI);
2011     widenScalarDst(MI, WideTy, 1);
2012     Observer.changedInstr(MI);
2013     return Legalized;
2014   case TargetOpcode::G_EXTRACT:
2015     return widenScalarExtract(MI, TypeIdx, WideTy);
2016   case TargetOpcode::G_INSERT:
2017     return widenScalarInsert(MI, TypeIdx, WideTy);
2018   case TargetOpcode::G_MERGE_VALUES:
2019     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2020   case TargetOpcode::G_UNMERGE_VALUES:
2021     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2022   case TargetOpcode::G_SADDO:
2023   case TargetOpcode::G_SSUBO:
2024   case TargetOpcode::G_UADDO:
2025   case TargetOpcode::G_USUBO:
2026   case TargetOpcode::G_SADDE:
2027   case TargetOpcode::G_SSUBE:
2028   case TargetOpcode::G_UADDE:
2029   case TargetOpcode::G_USUBE:
2030     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2031   case TargetOpcode::G_UMULO:
2032   case TargetOpcode::G_SMULO:
2033     return widenScalarMulo(MI, TypeIdx, WideTy);
2034   case TargetOpcode::G_SADDSAT:
2035   case TargetOpcode::G_SSUBSAT:
2036   case TargetOpcode::G_SSHLSAT:
2037   case TargetOpcode::G_UADDSAT:
2038   case TargetOpcode::G_USUBSAT:
2039   case TargetOpcode::G_USHLSAT:
2040     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2041   case TargetOpcode::G_CTTZ:
2042   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2043   case TargetOpcode::G_CTLZ:
2044   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2045   case TargetOpcode::G_CTPOP: {
2046     if (TypeIdx == 0) {
2047       Observer.changingInstr(MI);
2048       widenScalarDst(MI, WideTy, 0);
2049       Observer.changedInstr(MI);
2050       return Legalized;
2051     }
2052 
2053     Register SrcReg = MI.getOperand(1).getReg();
2054 
2055     // First extend the input.
2056     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2057                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2058                           ? TargetOpcode::G_ANYEXT
2059                           : TargetOpcode::G_ZEXT;
2060     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2061     LLT CurTy = MRI.getType(SrcReg);
2062     unsigned NewOpc = MI.getOpcode();
2063     if (NewOpc == TargetOpcode::G_CTTZ) {
2064       // The count is the same in the larger type except if the original
2065       // value was zero.  This can be handled by setting the bit just off
2066       // the top of the original type.
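      // e.g. for s8 G_CTTZ widened to s32, OR the input with 0x100 so that a
      // zero input produces 8 (the original bit width) rather than 32.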
2067       auto TopBit =
2068           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2069       MIBSrc = MIRBuilder.buildOr(
2070         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2071       // Now we know the operand is non-zero, use the more relaxed opcode.
2072       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2073     }
2074 
2075     // Perform the operation at the larger size.
2076     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
    // This is already the correct result for CTPOP and the CTTZs.
2078     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2079         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // The correct result is NewOp - (difference in width between WideTy and
      // CurTy).
2081       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2082       MIBNewOp = MIRBuilder.buildSub(
2083           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2084     }
2085 
2086     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2087     MI.eraseFromParent();
2088     return Legalized;
2089   }
2090   case TargetOpcode::G_BSWAP: {
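    // e.g. s16 G_BSWAP widened to s32: swap in the wide type, then shift the
    // interesting bytes back down:
    //   %1:_(s32) = G_ANYEXT %x:_(s16)
    //   %2:_(s32) = G_BSWAP %1
    //   %3:_(s32) = G_CONSTANT i32 16
    //   %4:_(s32) = G_LSHR %2, %3
    //   %d:_(s16) = G_TRUNC %4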
2091     Observer.changingInstr(MI);
2092     Register DstReg = MI.getOperand(0).getReg();
2093 
2094     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2095     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2096     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2097     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2098 
2099     MI.getOperand(0).setReg(DstExt);
2100 
2101     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2102 
2103     LLT Ty = MRI.getType(DstReg);
2104     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2105     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2106     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2107 
2108     MIRBuilder.buildTrunc(DstReg, ShrReg);
2109     Observer.changedInstr(MI);
2110     return Legalized;
2111   }
2112   case TargetOpcode::G_BITREVERSE: {
2113     Observer.changingInstr(MI);
2114 
2115     Register DstReg = MI.getOperand(0).getReg();
2116     LLT Ty = MRI.getType(DstReg);
2117     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2118 
2119     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2121     MI.getOperand(0).setReg(DstExt);
2122     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2123 
2124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2125     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2126     MIRBuilder.buildTrunc(DstReg, Shift);
2127     Observer.changedInstr(MI);
2128     return Legalized;
2129   }
2130   case TargetOpcode::G_FREEZE:
2131     Observer.changingInstr(MI);
2132     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2133     widenScalarDst(MI, WideTy);
2134     Observer.changedInstr(MI);
2135     return Legalized;
2136 
2137   case TargetOpcode::G_ABS:
2138     Observer.changingInstr(MI);
2139     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2140     widenScalarDst(MI, WideTy);
2141     Observer.changedInstr(MI);
2142     return Legalized;
2143 
2144   case TargetOpcode::G_ADD:
2145   case TargetOpcode::G_AND:
2146   case TargetOpcode::G_MUL:
2147   case TargetOpcode::G_OR:
2148   case TargetOpcode::G_XOR:
2149   case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high bits
    // don't affect the result) and then truncate the result back to the
    // original type.
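    // e.g. widening a s16 G_ADD to s32:
    //   %2:_(s16) = G_ADD %0:_(s16), %1:_(s16)
    // =>
    //   %3:_(s32) = G_ANYEXT %0:_(s16)
    //   %4:_(s32) = G_ANYEXT %1:_(s16)
    //   %5:_(s32) = G_ADD %3, %4
    //   %2:_(s16) = G_TRUNC %5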
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2155     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2156     widenScalarDst(MI, WideTy);
2157     Observer.changedInstr(MI);
2158     return Legalized;
2159 
2160   case TargetOpcode::G_SBFX:
2161   case TargetOpcode::G_UBFX:
2162     Observer.changingInstr(MI);
2163 
2164     if (TypeIdx == 0) {
2165       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2166       widenScalarDst(MI, WideTy);
2167     } else {
2168       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2169       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2170     }
2171 
2172     Observer.changedInstr(MI);
2173     return Legalized;
2174 
2175   case TargetOpcode::G_SHL:
2176     Observer.changingInstr(MI);
2177 
2178     if (TypeIdx == 0) {
2179       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180       widenScalarDst(MI, WideTy);
2181     } else {
2182       assert(TypeIdx == 1);
2183       // The "number of bits to shift" operand must preserve its value as an
2184       // unsigned integer:
2185       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2186     }
2187 
2188     Observer.changedInstr(MI);
2189     return Legalized;
2190 
2191   case TargetOpcode::G_SDIV:
2192   case TargetOpcode::G_SREM:
2193   case TargetOpcode::G_SMIN:
2194   case TargetOpcode::G_SMAX:
2195     Observer.changingInstr(MI);
2196     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2197     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2198     widenScalarDst(MI, WideTy);
2199     Observer.changedInstr(MI);
2200     return Legalized;
2201 
2202   case TargetOpcode::G_SDIVREM:
2203     Observer.changingInstr(MI);
2204     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2205     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2206     widenScalarDst(MI, WideTy);
2207     widenScalarDst(MI, WideTy, 1);
2208     Observer.changedInstr(MI);
2209     return Legalized;
2210 
2211   case TargetOpcode::G_ASHR:
2212   case TargetOpcode::G_LSHR:
2213     Observer.changingInstr(MI);
2214 
2215     if (TypeIdx == 0) {
2216       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2217         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2218 
2219       widenScalarSrc(MI, WideTy, 1, CvtOp);
2220       widenScalarDst(MI, WideTy);
2221     } else {
2222       assert(TypeIdx == 1);
2223       // The "number of bits to shift" operand must preserve its value as an
2224       // unsigned integer:
2225       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2226     }
2227 
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230   case TargetOpcode::G_UDIV:
2231   case TargetOpcode::G_UREM:
2232   case TargetOpcode::G_UMIN:
2233   case TargetOpcode::G_UMAX:
2234     Observer.changingInstr(MI);
2235     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2236     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2237     widenScalarDst(MI, WideTy);
2238     Observer.changedInstr(MI);
2239     return Legalized;
2240 
2241   case TargetOpcode::G_UDIVREM:
2242     Observer.changingInstr(MI);
2243     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     widenScalarDst(MI, WideTy);
2246     widenScalarDst(MI, WideTy, 1);
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SELECT:
2251     Observer.changingInstr(MI);
2252     if (TypeIdx == 0) {
2253       // Perform operation at larger width (any extension is fine here, high
2254       // bits don't affect the result) and then truncate the result back to the
2255       // original type.
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2257       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2258       widenScalarDst(MI, WideTy);
2259     } else {
2260       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2261       // Explicit extension is required here since high bits affect the result.
2262       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2263     }
2264     Observer.changedInstr(MI);
2265     return Legalized;
2266 
2267   case TargetOpcode::G_FPTOSI:
2268   case TargetOpcode::G_FPTOUI:
2269     Observer.changingInstr(MI);
2270 
2271     if (TypeIdx == 0)
2272       widenScalarDst(MI, WideTy);
2273     else
2274       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2275 
2276     Observer.changedInstr(MI);
2277     return Legalized;
2278   case TargetOpcode::G_SITOFP:
2279     Observer.changingInstr(MI);
2280 
2281     if (TypeIdx == 0)
2282       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2283     else
2284       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2285 
2286     Observer.changedInstr(MI);
2287     return Legalized;
2288   case TargetOpcode::G_UITOFP:
2289     Observer.changingInstr(MI);
2290 
2291     if (TypeIdx == 0)
2292       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2293     else
2294       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295 
2296     Observer.changedInstr(MI);
2297     return Legalized;
2298   case TargetOpcode::G_LOAD:
2299   case TargetOpcode::G_SEXTLOAD:
2300   case TargetOpcode::G_ZEXTLOAD:
2301     Observer.changingInstr(MI);
2302     widenScalarDst(MI, WideTy);
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305 
2306   case TargetOpcode::G_STORE: {
2307     if (TypeIdx != 0)
2308       return UnableToLegalize;
2309 
2310     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2311     if (!Ty.isScalar())
2312       return UnableToLegalize;
2313 
2314     Observer.changingInstr(MI);
2315 
2316     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2317       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2318     widenScalarSrc(MI, WideTy, 0, ExtType);
2319 
2320     Observer.changedInstr(MI);
2321     return Legalized;
2322   }
2323   case TargetOpcode::G_CONSTANT: {
2324     MachineOperand &SrcMO = MI.getOperand(1);
2325     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2326     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2327         MRI.getType(MI.getOperand(0).getReg()));
2328     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2329             ExtOpc == TargetOpcode::G_ANYEXT) &&
2330            "Illegal Extend");
2331     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2332     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2333                            ? SrcVal.sext(WideTy.getSizeInBits())
2334                            : SrcVal.zext(WideTy.getSizeInBits());
2335     Observer.changingInstr(MI);
2336     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2337 
2338     widenScalarDst(MI, WideTy);
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341   }
2342   case TargetOpcode::G_FCONSTANT: {
2343     MachineOperand &SrcMO = MI.getOperand(1);
2344     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2345     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2346     bool LosesInfo;
2347     switch (WideTy.getSizeInBits()) {
2348     case 32:
2349       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2350                   &LosesInfo);
2351       break;
2352     case 64:
2353       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2354                   &LosesInfo);
2355       break;
2356     default:
2357       return UnableToLegalize;
2358     }
2359 
2360     assert(!LosesInfo && "extend should always be lossless");
2361 
2362     Observer.changingInstr(MI);
2363     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2364 
2365     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2366     Observer.changedInstr(MI);
2367     return Legalized;
2368   }
2369   case TargetOpcode::G_IMPLICIT_DEF: {
2370     Observer.changingInstr(MI);
2371     widenScalarDst(MI, WideTy);
2372     Observer.changedInstr(MI);
2373     return Legalized;
2374   }
2375   case TargetOpcode::G_BRCOND:
2376     Observer.changingInstr(MI);
2377     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_FCMP:
2382     Observer.changingInstr(MI);
2383     if (TypeIdx == 0)
2384       widenScalarDst(MI, WideTy);
2385     else {
2386       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2387       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2388     }
2389     Observer.changedInstr(MI);
2390     return Legalized;
2391 
2392   case TargetOpcode::G_ICMP:
2393     Observer.changingInstr(MI);
2394     if (TypeIdx == 0)
2395       widenScalarDst(MI, WideTy);
2396     else {
2397       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2398                                MI.getOperand(1).getPredicate()))
2399                                ? TargetOpcode::G_SEXT
2400                                : TargetOpcode::G_ZEXT;
2401       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2402       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2403     }
2404     Observer.changedInstr(MI);
2405     return Legalized;
2406 
2407   case TargetOpcode::G_PTR_ADD:
2408     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2409     Observer.changingInstr(MI);
2410     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2411     Observer.changedInstr(MI);
2412     return Legalized;
2413 
2414   case TargetOpcode::G_PHI: {
2415     assert(TypeIdx == 0 && "Expecting only Idx 0");
2416 
2417     Observer.changingInstr(MI);
2418     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2419       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2420       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2421       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2422     }
2423 
2424     MachineBasicBlock &MBB = *MI.getParent();
2425     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2426     widenScalarDst(MI, WideTy);
2427     Observer.changedInstr(MI);
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2431     if (TypeIdx == 0) {
2432       Register VecReg = MI.getOperand(1).getReg();
2433       LLT VecTy = MRI.getType(VecReg);
2434       Observer.changingInstr(MI);
2435 
2436       widenScalarSrc(
2437           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2438           TargetOpcode::G_SEXT);
2439 
2440       widenScalarDst(MI, WideTy, 0);
2441       Observer.changedInstr(MI);
2442       return Legalized;
2443     }
2444 
2445     if (TypeIdx != 2)
2446       return UnableToLegalize;
2447     Observer.changingInstr(MI);
2448     // TODO: Probably should be zext
2449     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2450     Observer.changedInstr(MI);
2451     return Legalized;
2452   }
2453   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2454     if (TypeIdx == 1) {
2455       Observer.changingInstr(MI);
2456 
2457       Register VecReg = MI.getOperand(1).getReg();
2458       LLT VecTy = MRI.getType(VecReg);
2459       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2460 
2461       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2462       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2463       widenScalarDst(MI, WideVecTy, 0);
2464       Observer.changedInstr(MI);
2465       return Legalized;
2466     }
2467 
2468     if (TypeIdx == 2) {
2469       Observer.changingInstr(MI);
2470       // TODO: Probably should be zext
2471       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2472       Observer.changedInstr(MI);
2473       return Legalized;
2474     }
2475 
2476     return UnableToLegalize;
2477   }
2478   case TargetOpcode::G_FADD:
2479   case TargetOpcode::G_FMUL:
2480   case TargetOpcode::G_FSUB:
2481   case TargetOpcode::G_FMA:
2482   case TargetOpcode::G_FMAD:
2483   case TargetOpcode::G_FNEG:
2484   case TargetOpcode::G_FABS:
2485   case TargetOpcode::G_FCANONICALIZE:
2486   case TargetOpcode::G_FMINNUM:
2487   case TargetOpcode::G_FMAXNUM:
2488   case TargetOpcode::G_FMINNUM_IEEE:
2489   case TargetOpcode::G_FMAXNUM_IEEE:
2490   case TargetOpcode::G_FMINIMUM:
2491   case TargetOpcode::G_FMAXIMUM:
2492   case TargetOpcode::G_FDIV:
2493   case TargetOpcode::G_FREM:
2494   case TargetOpcode::G_FCEIL:
2495   case TargetOpcode::G_FFLOOR:
2496   case TargetOpcode::G_FCOS:
2497   case TargetOpcode::G_FSIN:
2498   case TargetOpcode::G_FLOG10:
2499   case TargetOpcode::G_FLOG:
2500   case TargetOpcode::G_FLOG2:
2501   case TargetOpcode::G_FRINT:
2502   case TargetOpcode::G_FNEARBYINT:
2503   case TargetOpcode::G_FSQRT:
2504   case TargetOpcode::G_FEXP:
2505   case TargetOpcode::G_FEXP2:
2506   case TargetOpcode::G_FPOW:
2507   case TargetOpcode::G_INTRINSIC_TRUNC:
2508   case TargetOpcode::G_INTRINSIC_ROUND:
2509   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2510     assert(TypeIdx == 0);
2511     Observer.changingInstr(MI);
2512 
2513     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2514       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2515 
2516     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2517     Observer.changedInstr(MI);
2518     return Legalized;
2519   case TargetOpcode::G_FPOWI: {
2520     if (TypeIdx != 0)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2524     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INTTOPTR:
2529     if (TypeIdx != 1)
2530       return UnableToLegalize;
2531 
2532     Observer.changingInstr(MI);
2533     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2534     Observer.changedInstr(MI);
2535     return Legalized;
2536   case TargetOpcode::G_PTRTOINT:
2537     if (TypeIdx != 0)
2538       return UnableToLegalize;
2539 
2540     Observer.changingInstr(MI);
2541     widenScalarDst(MI, WideTy, 0);
2542     Observer.changedInstr(MI);
2543     return Legalized;
2544   case TargetOpcode::G_BUILD_VECTOR: {
2545     Observer.changingInstr(MI);
2546 
2547     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2548     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2549       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2550 
2551     // Avoid changing the result vector type if the source element type was
2552     // requested.
2553     if (TypeIdx == 1) {
2554       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2555     } else {
2556       widenScalarDst(MI, WideTy, 0);
2557     }
2558 
2559     Observer.changedInstr(MI);
2560     return Legalized;
2561   }
2562   case TargetOpcode::G_SEXT_INREG:
2563     if (TypeIdx != 0)
2564       return UnableToLegalize;
2565 
2566     Observer.changingInstr(MI);
2567     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2568     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2569     Observer.changedInstr(MI);
2570     return Legalized;
2571   case TargetOpcode::G_PTRMASK: {
2572     if (TypeIdx != 1)
2573       return UnableToLegalize;
2574     Observer.changingInstr(MI);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     Observer.changedInstr(MI);
2577     return Legalized;
2578   }
2579   }
2580 }
2581 
2582 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2583                              MachineIRBuilder &B, Register Src, LLT Ty) {
2584   auto Unmerge = B.buildUnmerge(Ty, Src);
2585   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2586     Pieces.push_back(Unmerge.getReg(I));
2587 }
2588 
2589 LegalizerHelper::LegalizeResult
2590 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2591   Register Dst = MI.getOperand(0).getReg();
2592   Register Src = MI.getOperand(1).getReg();
2593   LLT DstTy = MRI.getType(Dst);
2594   LLT SrcTy = MRI.getType(Src);
2595 
2596   if (SrcTy.isVector()) {
2597     LLT SrcEltTy = SrcTy.getElementType();
2598     SmallVector<Register, 8> SrcRegs;
2599 
2600     if (DstTy.isVector()) {
2601       int NumDstElt = DstTy.getNumElements();
2602       int NumSrcElt = SrcTy.getNumElements();
2603 
2604       LLT DstEltTy = DstTy.getElementType();
2605       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2606       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2607 
2608       // If there's an element size mismatch, insert intermediate casts to match
2609       // the result element type.
2610       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2611         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2612         //
2613         // =>
2614         //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
2619         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2620         SrcPartTy = SrcEltTy;
2621       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2622         //
2623         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2624         //
2625         // =>
2626         //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
2631         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2632         DstCastTy = DstEltTy;
2633       }
2634 
2635       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2636       for (Register &SrcReg : SrcRegs)
2637         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2638     } else
2639       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2640 
2641     MIRBuilder.buildMerge(Dst, SrcRegs);
2642     MI.eraseFromParent();
2643     return Legalized;
2644   }
2645 
2646   if (DstTy.isVector()) {
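    // Cast from a scalar source: unmerge into destination-element-sized
    // pieces and build the result vector from them, e.g.
    //   %1:_(<2 x s16>) = G_BITCAST %0:_(s32)
    // =>
    //   %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0:_(s32)
    //   %1:_(<2 x s16>) = G_BUILD_VECTOR %2:_(s16), %3:_(s16)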
2647     SmallVector<Register, 8> SrcRegs;
2648     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2649     MIRBuilder.buildMerge(Dst, SrcRegs);
2650     MI.eraseFromParent();
2651     return Legalized;
2652   }
2653 
2654   return UnableToLegalize;
2655 }
2656 
/// Figure out the bit offset into a register when coercing a vector index for
/// the wide element type. This is only for the case when promoting a vector to
/// one with larger elements.
///
2662 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2663 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
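///
/// e.g. when viewing a <4 x s16> as <2 x s32> (NewEltSize = 32, OldEltSize =
/// 16), index 3 gives %offset_idx = 3 & 1 = 1 and %offset_bits = 1 << 4 = 16.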
2664 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2665                                                    Register Idx,
2666                                                    unsigned NewEltSize,
2667                                                    unsigned OldEltSize) {
2668   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2669   LLT IdxTy = B.getMRI()->getType(Idx);
2670 
2671   // Now figure out the amount we need to shift to get the target bits.
2672   auto OffsetMask = B.buildConstant(
2673     IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
2674   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2675   return B.buildShl(IdxTy, OffsetIdx,
2676                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2677 }
2678 
2679 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2680 /// is casting to a vector with a smaller element size, perform multiple element
2681 /// extracts and merge the results. If this is coercing to a vector with larger
2682 /// elements, index the bitcasted vector and extract the target element with bit
2683 /// operations. This is intended to force the indexing in the native register
2684 /// size for architectures that can dynamically index the register file.
2685 LegalizerHelper::LegalizeResult
2686 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2687                                          LLT CastTy) {
2688   if (TypeIdx != 1)
2689     return UnableToLegalize;
2690 
2691   Register Dst = MI.getOperand(0).getReg();
2692   Register SrcVec = MI.getOperand(1).getReg();
2693   Register Idx = MI.getOperand(2).getReg();
2694   LLT SrcVecTy = MRI.getType(SrcVec);
2695   LLT IdxTy = MRI.getType(Idx);
2696 
2697   LLT SrcEltTy = SrcVecTy.getElementType();
2698   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2699   unsigned OldNumElts = SrcVecTy.getNumElements();
2700 
2701   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2702   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2703 
2704   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2705   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2706   if (NewNumElts > OldNumElts) {
2707     // Decreasing the vector element size
2708     //
2709     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2710     //  =>
2711     //  v4i32:castx = bitcast x:v2i64
2712     //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
2716     //
2717     if (NewNumElts % OldNumElts != 0)
2718       return UnableToLegalize;
2719 
2720     // Type of the intermediate result vector.
2721     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2722     LLT MidTy =
2723         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2724 
2725     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2726 
2727     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2728     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2729 
2730     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2731       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2732       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2733       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2734       NewOps[I] = Elt.getReg(0);
2735     }
2736 
2737     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2738     MIRBuilder.buildBitcast(Dst, NewVec);
2739     MI.eraseFromParent();
2740     return Legalized;
2741   }
2742 
2743   if (NewNumElts < OldNumElts) {
2744     if (NewEltSize % OldEltSize != 0)
2745       return UnableToLegalize;
2746 
2747     // This only depends on powers of 2 because we use bit tricks to figure out
2748     // the bit offset we need to shift to get the target element. A general
2749     // expansion could emit division/multiply.
2750     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2751       return UnableToLegalize;
2752 
2753     // Increasing the vector element size.
2754     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2755     //
2756     //   =>
2757     //
2758     // %cast = G_BITCAST %vec
2759     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2760     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2761     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2762     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2763     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2764     // %elt = G_TRUNC %elt_bits
2765 
2766     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2767     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2768 
2769     // Divide to get the index in the wider element type.
2770     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2771 
2772     Register WideElt = CastVec;
2773     if (CastTy.isVector()) {
2774       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2775                                                      ScaledIdx).getReg(0);
2776     }
2777 
2778     // Compute the bit offset into the register of the target element.
2779     Register OffsetBits = getBitcastWiderVectorElementOffset(
2780       MIRBuilder, Idx, NewEltSize, OldEltSize);
2781 
2782     // Shift the wide element to get the target element.
2783     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2784     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2785     MI.eraseFromParent();
2786     return Legalized;
2787   }
2788 
2789   return UnableToLegalize;
2790 }
2791 
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
/// preserving the other bits in \p TargetReg.
///
/// (ZExt(InsertReg) << OffsetBits) |
///   (TargetReg & ~(LowBitsMask(InsertReg.size()) << OffsetBits))
2796 static Register buildBitFieldInsert(MachineIRBuilder &B,
2797                                     Register TargetReg, Register InsertReg,
2798                                     Register OffsetBits) {
2799   LLT TargetTy = B.getMRI()->getType(TargetReg);
2800   LLT InsertTy = B.getMRI()->getType(InsertReg);
2801   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2802   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2803 
2804   // Produce a bitmask of the value to insert
2805   auto EltMask = B.buildConstant(
2806     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2807                                    InsertTy.getSizeInBits()));
2808   // Shift it into position
2809   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2810   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2811 
2812   // Clear out the bits in the wide element
2813   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2814 
2815   // The value to insert has all zeros already, so stick it into the masked
2816   // wide element.
2817   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2818 }
2819 
2820 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2821 /// is increasing the element size, perform the indexing in the target element
2822 /// type, and use bit operations to insert at the element position. This is
2823 /// intended for architectures that can dynamically index the register file and
2824 /// want to force indexing in the native register size.
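///
/// A sketch of the expansion when widening the element type (mirroring the
/// extract case above; the exact bit-offset computation is elided):
///
/// %dst:_(<N x small_elt>) = G_INSERT_VECTOR_ELT %vec, %val, %idx
///
///   =>
///
/// %cast = G_BITCAST %vec
/// %scaled_idx = G_LSHR %idx, Log2(NewEltSize / OldEltSize)
/// %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
/// %merged = <bitfield insert of %val into %wide_elt at the element's offset>
/// %new_cast = G_INSERT_VECTOR_ELT %cast, %merged, %scaled_idx
/// %dst = G_BITCAST %new_cast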
2825 LegalizerHelper::LegalizeResult
2826 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2827                                         LLT CastTy) {
2828   if (TypeIdx != 0)
2829     return UnableToLegalize;
2830 
2831   Register Dst = MI.getOperand(0).getReg();
2832   Register SrcVec = MI.getOperand(1).getReg();
2833   Register Val = MI.getOperand(2).getReg();
2834   Register Idx = MI.getOperand(3).getReg();
2835 
2836   LLT VecTy = MRI.getType(Dst);
2837   LLT IdxTy = MRI.getType(Idx);
2838 
2839   LLT VecEltTy = VecTy.getElementType();
2840   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2841   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2842   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2843 
2844   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2845   unsigned OldNumElts = VecTy.getNumElements();
2846 
2847   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2848   if (NewNumElts < OldNumElts) {
2849     if (NewEltSize % OldEltSize != 0)
2850       return UnableToLegalize;
2851 
2852     // This only depends on powers of 2 because we use bit tricks to figure out
2853     // the bit offset we need to shift to get the target element. A general
2854     // expansion could emit division/multiply.
2855     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2856       return UnableToLegalize;
2857 
2858     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2859     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2860 
2861     // Divide to get the index in the wider element type.
2862     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2863 
2864     Register ExtractedElt = CastVec;
2865     if (CastTy.isVector()) {
2866       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2867                                                           ScaledIdx).getReg(0);
2868     }
2869 
2870     // Compute the bit offset into the register of the target element.
2871     Register OffsetBits = getBitcastWiderVectorElementOffset(
2872       MIRBuilder, Idx, NewEltSize, OldEltSize);
2873 
2874     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2875                                                Val, OffsetBits);
2876     if (CastTy.isVector()) {
2877       InsertedElt = MIRBuilder.buildInsertVectorElement(
2878         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2879     }
2880 
2881     MIRBuilder.buildBitcast(Dst, InsertedElt);
2882     MI.eraseFromParent();
2883     return Legalized;
2884   }
2885 
2886   return UnableToLegalize;
2887 }
2888 
2889 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2890   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2891   Register DstReg = LoadMI.getDstReg();
2892   Register PtrReg = LoadMI.getPointerReg();
2893   LLT DstTy = MRI.getType(DstReg);
2894   MachineMemOperand &MMO = LoadMI.getMMO();
2895   LLT MemTy = MMO.getMemoryType();
2896   MachineFunction &MF = MIRBuilder.getMF();
2897 
2898   unsigned MemSizeInBits = MemTy.getSizeInBits();
2899   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2900 
2901   if (MemSizeInBits != MemStoreSizeInBits) {
2902     if (MemTy.isVector())
2903       return UnableToLegalize;
2904 
2905     // Promote to a byte-sized load if not loading an integral number of
2906     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2907     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2908     MachineMemOperand *NewMMO =
2909         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2910 
2911     Register LoadReg = DstReg;
2912     LLT LoadTy = DstTy;
2913 
2914     // If this wasn't already an extending load, we need to widen the result
2915     // register to avoid creating a load with a narrower result than the source.
2916     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2917       LoadTy = WideMemTy;
2918       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2919     }
2920 
2921     if (isa<GSExtLoad>(LoadMI)) {
2922       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2923       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2924     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2925       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2926       // The extra bits are guaranteed to be zero, since we stored them that
2927       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2928       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
2929     } else {
2930       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
2931     }
2932 
2933     if (DstTy != LoadTy)
2934       MIRBuilder.buildTrunc(DstReg, LoadReg);
2935 
2936     LoadMI.eraseFromParent();
2937     return Legalized;
2938   }
2939 
2940   // Big endian lowering not implemented.
2941   if (MIRBuilder.getDataLayout().isBigEndian())
2942     return UnableToLegalize;
2943 
2944   // This load needs splitting into power of 2 sized loads.
2945   //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to the next power-of-2 result type, then combine the two
  // results, before truncating back down to the non-pow-2 type.
2950   // E.g. v1 = i24 load =>
2951   // v2 = i32 zextload (2 byte)
2952   // v3 = i32 load (1 byte)
2953   // v4 = i32 shl v3, 16
2954   // v5 = i32 or v4, v2
2955   // v1 = i24 trunc v5
2956   // By doing this we generate the correct truncate which should get
2957   // combined away as an artifact with a matching extend.
2958 
2959   uint64_t LargeSplitSize, SmallSplitSize;
2960 
2961   if (!isPowerOf2_32(MemSizeInBits)) {
    // Split into the largest power-of-2 piece plus the remainder.
2963     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
2964     SmallSplitSize = MemSizeInBits - LargeSplitSize;
2965   } else {
2966     // This is already a power of 2, but we still need to split this in half.
2967     //
2968     // Assume we're being asked to decompose an unaligned load.
2969     // TODO: If this requires multiple splits, handle them all at once.
2970     auto &Ctx = MF.getFunction().getContext();
2971     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
2972       return UnableToLegalize;
2973 
2974     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
2975   }
2976 
2977   if (MemTy.isVector()) {
2978     // TODO: Handle vector extloads
2979     if (MemTy != DstTy)
2980       return UnableToLegalize;
2981 
2982     // TODO: We can do better than scalarizing the vector and at least split it
2983     // in half.
2984     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
2985   }
2986 
2987   MachineMemOperand *LargeMMO =
2988       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
2989   MachineMemOperand *SmallMMO =
2990       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
2991 
2992   LLT PtrTy = MRI.getType(PtrReg);
2993   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
2994   LLT AnyExtTy = LLT::scalar(AnyExtSize);
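  // The low piece must be a zero-extending load so its bits above
  // LargeSplitSize are known zero before being ORed with the shifted high
  // piece.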
2995   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
2996                                              PtrReg, *LargeMMO);
2997 
2998   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
2999                                             LargeSplitSize / 8);
3000   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3001   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3002   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3003                                              SmallPtr, *SmallMMO);
3004 
3005   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3006   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3007 
  if (AnyExtTy == DstTy) {
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  } else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3011     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3012     MIRBuilder.buildTrunc(DstReg, {Or});
3013   } else {
3014     assert(DstTy.isPointer() && "expected pointer");
3015     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3016 
    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
3019     MIRBuilder.buildIntToPtr(DstReg, Or);
3020   }
3021 
3022   LoadMI.eraseFromParent();
3023   return Legalized;
3024 }
3025 
3026 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3027   // Lower a non-power of 2 store into multiple pow-2 stores.
3028   // E.g. split an i24 store into an i16 store + i8 store.
3029   // We do this by first extending the stored value to the next largest power
3030   // of 2 type, and then using truncating stores to store the components.
3031   // By doing this, likewise with G_LOAD, generate an extend that can be
3032   // artifact-combined away instead of leaving behind extracts.
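  //
  // E.g. (a sketch; the actual offsets and sizes come from the split logic
  // below) a G_STORE of %val:_(s24) becomes:
  //   %ext:_(s32) = G_ANYEXT %val
  //   %hi:_(s32)  = G_LSHR %ext, 16
  //   G_STORE %ext, %ptr       ; 2-byte truncating store
  //   %ptr2:_(p0) = G_PTR_ADD %ptr, 2
  //   G_STORE %hi, %ptr2       ; 1-byte truncating store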
3033   Register SrcReg = StoreMI.getValueReg();
3034   Register PtrReg = StoreMI.getPointerReg();
3035   LLT SrcTy = MRI.getType(SrcReg);
3036   MachineFunction &MF = MIRBuilder.getMF();
3037   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3038   LLT MemTy = MMO.getMemoryType();
3039 
3040   unsigned StoreWidth = MemTy.getSizeInBits();
3041   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3042 
3043   if (StoreWidth != StoreSizeInBits) {
3044     if (SrcTy.isVector())
3045       return UnableToLegalize;
3046 
3047     // Promote to a byte-sized store with upper bits zero if not
3048     // storing an integral number of bytes.  For example, promote
3049     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3050     LLT WideTy = LLT::scalar(StoreSizeInBits);
3051 
3052     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3053       // Avoid creating a store with a narrower source than result.
3054       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3055       SrcTy = WideTy;
3056     }
3057 
3058     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3059 
3060     MachineMemOperand *NewMMO =
3061         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3062     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3063     StoreMI.eraseFromParent();
3064     return Legalized;
3065   }
3066 
3067   if (MemTy.isVector()) {
3068     // TODO: Handle vector trunc stores
3069     if (MemTy != SrcTy)
3070       return UnableToLegalize;
3071 
3072     // TODO: We can do better than scalarizing the vector and at least split it
3073     // in half.
3074     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3075   }
3076 
3077   unsigned MemSizeInBits = MemTy.getSizeInBits();
3078   uint64_t LargeSplitSize, SmallSplitSize;
3079 
3080   if (!isPowerOf2_32(MemSizeInBits)) {
3081     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3082     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3083   } else {
3084     auto &Ctx = MF.getFunction().getContext();
3085     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3086       return UnableToLegalize; // Don't know what we're being asked to do.
3087 
3088     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3089   }
3090 
3091   // Extend to the next pow-2. If this store was itself the result of lowering,
3092   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3093   // that's wider than the stored size.
3094   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3095   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3096 
3097   if (SrcTy.isPointer()) {
3098     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3099     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3100   }
3101 
3102   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3103 
3104   // Obtain the smaller value by shifting away the larger value.
3105   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3106   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3107 
3108   // Generate the PtrAdd and truncating stores.
3109   LLT PtrTy = MRI.getType(PtrReg);
3110   auto OffsetCst = MIRBuilder.buildConstant(
3111     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3112   auto SmallPtr =
3113     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3114 
3115   MachineMemOperand *LargeMMO =
3116     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3117   MachineMemOperand *SmallMMO =
3118     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3119   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3120   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3121   StoreMI.eraseFromParent();
3122   return Legalized;
3123 }
3124 
3125 LegalizerHelper::LegalizeResult
3126 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3127   switch (MI.getOpcode()) {
3128   case TargetOpcode::G_LOAD: {
3129     if (TypeIdx != 0)
3130       return UnableToLegalize;
3131     MachineMemOperand &MMO = **MI.memoperands_begin();
3132 
3133     // Not sure how to interpret a bitcast of an extending load.
3134     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3135       return UnableToLegalize;
3136 
3137     Observer.changingInstr(MI);
3138     bitcastDst(MI, CastTy, 0);
3139     MMO.setType(CastTy);
3140     Observer.changedInstr(MI);
3141     return Legalized;
3142   }
3143   case TargetOpcode::G_STORE: {
3144     if (TypeIdx != 0)
3145       return UnableToLegalize;
3146 
3147     MachineMemOperand &MMO = **MI.memoperands_begin();
3148 
3149     // Not sure how to interpret a bitcast of a truncating store.
3150     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3151       return UnableToLegalize;
3152 
3153     Observer.changingInstr(MI);
3154     bitcastSrc(MI, CastTy, 0);
3155     MMO.setType(CastTy);
3156     Observer.changedInstr(MI);
3157     return Legalized;
3158   }
3159   case TargetOpcode::G_SELECT: {
3160     if (TypeIdx != 0)
3161       return UnableToLegalize;
3162 
3163     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3164       LLVM_DEBUG(
3165           dbgs() << "bitcast action not implemented for vector select\n");
3166       return UnableToLegalize;
3167     }
3168 
3169     Observer.changingInstr(MI);
3170     bitcastSrc(MI, CastTy, 2);
3171     bitcastSrc(MI, CastTy, 3);
3172     bitcastDst(MI, CastTy, 0);
3173     Observer.changedInstr(MI);
3174     return Legalized;
3175   }
3176   case TargetOpcode::G_AND:
3177   case TargetOpcode::G_OR:
3178   case TargetOpcode::G_XOR: {
3179     Observer.changingInstr(MI);
3180     bitcastSrc(MI, CastTy, 1);
3181     bitcastSrc(MI, CastTy, 2);
3182     bitcastDst(MI, CastTy, 0);
3183     Observer.changedInstr(MI);
3184     return Legalized;
3185   }
3186   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3187     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3188   case TargetOpcode::G_INSERT_VECTOR_ELT:
3189     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3190   default:
3191     return UnableToLegalize;
3192   }
3193 }
3194 
3195 // Legalize an instruction by changing the opcode in place.
3196 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
  Observer.changingInstr(MI);
  MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
  Observer.changedInstr(MI);
3200 }
3201 
3202 LegalizerHelper::LegalizeResult
3203 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3204   using namespace TargetOpcode;
3205 
3206   switch(MI.getOpcode()) {
3207   default:
3208     return UnableToLegalize;
3209   case TargetOpcode::G_BITCAST:
3210     return lowerBitcast(MI);
3211   case TargetOpcode::G_SREM:
3212   case TargetOpcode::G_UREM: {
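    // Lower the remainder in terms of the division: rem = x - (x / y) * y.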
3213     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3214     auto Quot =
3215         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3216                               {MI.getOperand(1), MI.getOperand(2)});
3217 
3218     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3219     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3220     MI.eraseFromParent();
3221     return Legalized;
3222   }
3223   case TargetOpcode::G_SADDO:
3224   case TargetOpcode::G_SSUBO:
3225     return lowerSADDO_SSUBO(MI);
3226   case TargetOpcode::G_UMULH:
3227   case TargetOpcode::G_SMULH:
3228     return lowerSMULH_UMULH(MI);
3229   case TargetOpcode::G_SMULO:
3230   case TargetOpcode::G_UMULO: {
3231     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3232     // result.
3233     Register Res = MI.getOperand(0).getReg();
3234     Register Overflow = MI.getOperand(1).getReg();
3235     Register LHS = MI.getOperand(2).getReg();
3236     Register RHS = MI.getOperand(3).getReg();
3237     LLT Ty = MRI.getType(Res);
3238 
3239     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3240                           ? TargetOpcode::G_SMULH
3241                           : TargetOpcode::G_UMULH;
3242 
3243     Observer.changingInstr(MI);
3244     const auto &TII = MIRBuilder.getTII();
3245     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3246     MI.RemoveOperand(1);
3247     Observer.changedInstr(MI);
3248 
3249     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3250     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3251 
3252     // Move insert point forward so we can use the Res register if needed.
3253     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3254 
3255     // For *signed* multiply, overflow is detected by checking:
3256     // (hi != (lo >> bitwidth-1))
3257     if (Opcode == TargetOpcode::G_SMULH) {
3258       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3259       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3260       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3261     } else {
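      // For unsigned multiply, any nonzero high half means the product
      // overflowed.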
3262       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3263     }
3264     return Legalized;
3265   }
3266   case TargetOpcode::G_FNEG: {
3267     Register Res = MI.getOperand(0).getReg();
3268     LLT Ty = MRI.getType(Res);
3269 
3270     // TODO: Handle vector types once we are able to
3271     // represent them.
3272     if (Ty.isVector())
3273       return UnableToLegalize;
3274     auto SignMask =
3275         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3276     Register SubByReg = MI.getOperand(1).getReg();
3277     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3278     MI.eraseFromParent();
3279     return Legalized;
3280   }
3281   case TargetOpcode::G_FSUB: {
3282     Register Res = MI.getOperand(0).getReg();
3283     LLT Ty = MRI.getType(Res);
3284 
3285     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3286     // First, check if G_FNEG is marked as Lower. If so, we may
3287     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3288     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3289       return UnableToLegalize;
3290     Register LHS = MI.getOperand(1).getReg();
3291     Register RHS = MI.getOperand(2).getReg();
3292     Register Neg = MRI.createGenericVirtualRegister(Ty);
3293     MIRBuilder.buildFNeg(Neg, RHS);
3294     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_FMAD:
3299     return lowerFMad(MI);
3300   case TargetOpcode::G_FFLOOR:
3301     return lowerFFloor(MI);
3302   case TargetOpcode::G_INTRINSIC_ROUND:
3303     return lowerIntrinsicRound(MI);
3304   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3305     // Since round even is the assumed rounding mode for unconstrained FP
3306     // operations, rint and roundeven are the same operation.
3307     changeOpcode(MI, TargetOpcode::G_FRINT);
3308     return Legalized;
3309   }
3310   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3311     Register OldValRes = MI.getOperand(0).getReg();
3312     Register SuccessRes = MI.getOperand(1).getReg();
3313     Register Addr = MI.getOperand(2).getReg();
3314     Register CmpVal = MI.getOperand(3).getReg();
3315     Register NewVal = MI.getOperand(4).getReg();
3316     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3317                                   **MI.memoperands_begin());
3318     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3319     MI.eraseFromParent();
3320     return Legalized;
3321   }
3322   case TargetOpcode::G_LOAD:
3323   case TargetOpcode::G_SEXTLOAD:
3324   case TargetOpcode::G_ZEXTLOAD:
3325     return lowerLoad(cast<GAnyLoad>(MI));
3326   case TargetOpcode::G_STORE:
3327     return lowerStore(cast<GStore>(MI));
3328   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3329   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3330   case TargetOpcode::G_CTLZ:
3331   case TargetOpcode::G_CTTZ:
3332   case TargetOpcode::G_CTPOP:
3333     return lowerBitCount(MI);
3334   case G_UADDO: {
3335     Register Res = MI.getOperand(0).getReg();
3336     Register CarryOut = MI.getOperand(1).getReg();
3337     Register LHS = MI.getOperand(2).getReg();
3338     Register RHS = MI.getOperand(3).getReg();
3339 
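    // With no carry-in, an unsigned add wraps iff the result is (unsigned)
    // less than either operand; compare against RHS here.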
3340     MIRBuilder.buildAdd(Res, LHS, RHS);
3341     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3342 
3343     MI.eraseFromParent();
3344     return Legalized;
3345   }
3346   case G_UADDE: {
3347     Register Res = MI.getOperand(0).getReg();
3348     Register CarryOut = MI.getOperand(1).getReg();
3349     Register LHS = MI.getOperand(2).getReg();
3350     Register RHS = MI.getOperand(3).getReg();
3351     Register CarryIn = MI.getOperand(4).getReg();
3352     LLT Ty = MRI.getType(Res);
3353 
    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
    // The carry-out can come from the first add or from adding the carry-in
    // (e.g. LHS + ~0 + 1 wraps back to exactly LHS), so check both adds.
    const LLT CondTy = MRI.getType(CarryOut);
    auto Carry1 = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
    auto Carry2 = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, Res, TmpRes);
    MIRBuilder.buildOr(CarryOut, Carry1, Carry2);
3358 
3359     MI.eraseFromParent();
3360     return Legalized;
3361   }
3362   case G_USUBO: {
3363     Register Res = MI.getOperand(0).getReg();
3364     Register BorrowOut = MI.getOperand(1).getReg();
3365     Register LHS = MI.getOperand(2).getReg();
3366     Register RHS = MI.getOperand(3).getReg();
3367 
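    // An unsigned subtract borrows exactly when LHS <u RHS.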
3368     MIRBuilder.buildSub(Res, LHS, RHS);
3369     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3370 
3371     MI.eraseFromParent();
3372     return Legalized;
3373   }
3374   case G_USUBE: {
3375     Register Res = MI.getOperand(0).getReg();
3376     Register BorrowOut = MI.getOperand(1).getReg();
3377     Register LHS = MI.getOperand(2).getReg();
3378     Register RHS = MI.getOperand(3).getReg();
3379     Register BorrowIn = MI.getOperand(4).getReg();
3380     const LLT CondTy = MRI.getType(BorrowOut);
3381     const LLT Ty = MRI.getType(Res);
3382 
3383     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3384     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3385     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3386 
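    // Borrow-out: when LHS == RHS, the subtraction borrows exactly when the
    // incoming borrow was set; otherwise it borrows when LHS <u RHS.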
3387     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3388     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3389     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3390 
3391     MI.eraseFromParent();
3392     return Legalized;
3393   }
3394   case G_UITOFP:
3395     return lowerUITOFP(MI);
3396   case G_SITOFP:
3397     return lowerSITOFP(MI);
3398   case G_FPTOUI:
3399     return lowerFPTOUI(MI);
3400   case G_FPTOSI:
3401     return lowerFPTOSI(MI);
3402   case G_FPTRUNC:
3403     return lowerFPTRUNC(MI);
3404   case G_FPOWI:
3405     return lowerFPOWI(MI);
3406   case G_SMIN:
3407   case G_SMAX:
3408   case G_UMIN:
3409   case G_UMAX:
3410     return lowerMinMax(MI);
3411   case G_FCOPYSIGN:
3412     return lowerFCopySign(MI);
3413   case G_FMINNUM:
3414   case G_FMAXNUM:
3415     return lowerFMinNumMaxNum(MI);
3416   case G_MERGE_VALUES:
3417     return lowerMergeValues(MI);
3418   case G_UNMERGE_VALUES:
3419     return lowerUnmergeValues(MI);
3420   case TargetOpcode::G_SEXT_INREG: {
3421     assert(MI.getOperand(2).isImm() && "Expected immediate");
3422     int64_t SizeInBits = MI.getOperand(2).getImm();
3423 
3424     Register DstReg = MI.getOperand(0).getReg();
3425     Register SrcReg = MI.getOperand(1).getReg();
3426     LLT DstTy = MRI.getType(DstReg);
3427     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3428 
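    // Shift left so the sign bit of the SizeInBits-wide field lands in the
    // type's sign bit, then arithmetic shift right to smear it back down.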
    auto MIBSz = MIRBuilder.buildConstant(
        DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3430     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3431     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3432     MI.eraseFromParent();
3433     return Legalized;
3434   }
3435   case G_EXTRACT_VECTOR_ELT:
3436   case G_INSERT_VECTOR_ELT:
3437     return lowerExtractInsertVectorElt(MI);
3438   case G_SHUFFLE_VECTOR:
3439     return lowerShuffleVector(MI);
3440   case G_DYN_STACKALLOC:
3441     return lowerDynStackAlloc(MI);
3442   case G_EXTRACT:
3443     return lowerExtract(MI);
3444   case G_INSERT:
3445     return lowerInsert(MI);
3446   case G_BSWAP:
3447     return lowerBswap(MI);
3448   case G_BITREVERSE:
3449     return lowerBitreverse(MI);
3450   case G_READ_REGISTER:
3451   case G_WRITE_REGISTER:
3452     return lowerReadWriteRegister(MI);
3453   case G_UADDSAT:
3454   case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this by requesting custom lowering and calling the
    // implementation functions directly.
3458     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3459     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3460       return lowerAddSubSatToMinMax(MI);
3461     return lowerAddSubSatToAddoSubo(MI);
3462   }
3463   case G_SADDSAT:
3464   case G_SSUBSAT: {
3465     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3466 
3467     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3468     // since it's a shorter expansion. However, we would need to figure out the
3469     // preferred boolean type for the carry out for the query.
3470     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3471       return lowerAddSubSatToMinMax(MI);
3472     return lowerAddSubSatToAddoSubo(MI);
3473   }
3474   case G_SSHLSAT:
3475   case G_USHLSAT:
3476     return lowerShlSat(MI);
3477   case G_ABS:
3478     return lowerAbsToAddXor(MI);
3479   case G_SELECT:
3480     return lowerSelect(MI);
3481   case G_SDIVREM:
3482   case G_UDIVREM:
3483     return lowerDIVREM(MI);
3484   case G_FSHL:
3485   case G_FSHR:
3486     return lowerFunnelShift(MI);
3487   case G_ROTL:
3488   case G_ROTR:
3489     return lowerRotate(MI);
3490   case G_MEMSET:
3491   case G_MEMCPY:
3492   case G_MEMMOVE:
3493     return lowerMemCpyFamily(MI);
3494   case G_MEMCPY_INLINE:
3495     return lowerMemcpyInline(MI);
3496   GISEL_VECREDUCE_CASES_NONSEQ
3497     return lowerVectorReduction(MI);
3498   }
3499 }
3500 
3501 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3502                                                   Align MinAlign) const {
3503   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3504   // datalayout for the preferred alignment. Also there should be a target hook
3505   // for this to allow targets to reduce the alignment and ignore the
3506   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3507   // the type.
3508   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3509 }
3510 
3511 MachineInstrBuilder
3512 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3513                                       MachinePointerInfo &PtrInfo) {
3514   MachineFunction &MF = MIRBuilder.getMF();
3515   const DataLayout &DL = MIRBuilder.getDataLayout();
3516   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3517 
3518   unsigned AddrSpace = DL.getAllocaAddrSpace();
3519   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3520 
3521   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3522   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3523 }
3524 
3525 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3526                                         LLT VecTy) {
3527   int64_t IdxVal;
3528   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3529     return IdxReg;
3530 
3531   LLT IdxTy = B.getMRI()->getType(IdxReg);
3532   unsigned NElts = VecTy.getNumElements();
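  // With a power-of-2 element count the index can be clamped with a cheap
  // mask; otherwise fall back to an unsigned min against the last element.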
3533   if (isPowerOf2_32(NElts)) {
3534     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3535     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3536   }
3537 
3538   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3539       .getReg(0);
3540 }
3541 
3542 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3543                                                   Register Index) {
3544   LLT EltTy = VecTy.getElementType();
3545 
3546   // Calculate the element offset and add it to the pointer.
3547   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3548   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3549          "Converting bits to bytes lost precision");
3550 
3551   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3552 
3553   LLT IdxTy = MRI.getType(Index);
3554   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3555                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3556 
3557   LLT PtrTy = MRI.getType(VecPtr);
3558   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3559 }
3560 
3561 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3562     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3563   Register DstReg = MI.getOperand(0).getReg();
3564   LLT DstTy = MRI.getType(DstReg);
3565   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3566 
3567   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3568 
3569   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3570   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3571 
3572   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3573   MI.eraseFromParent();
3574   return Legalized;
3575 }
3576 
3577 // Handle splitting vector operations which need to have the same number of
3578 // elements in each type index, but each type index may have a different element
3579 // type.
3580 //
3581 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3582 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3583 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3584 //
// Also handles some irregular breakdown cases, e.g.
//       <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3587 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3588 //             s64 = G_SHL s64, s32
3589 LegalizerHelper::LegalizeResult
3590 LegalizerHelper::fewerElementsVectorMultiEltType(
3591   MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
3592   if (TypeIdx != 0)
3593     return UnableToLegalize;
3594 
3595   const LLT NarrowTy0 = NarrowTyArg;
3596   const Register DstReg = MI.getOperand(0).getReg();
3597   LLT DstTy = MRI.getType(DstReg);
3598   LLT LeftoverTy0;
3599 
3600   // All of the operands need to have the same number of elements, so if we can
3601   // determine a type breakdown for the result type, we can for all of the
3602   // source types.
3603   int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
3604   if (NumParts < 0)
3605     return UnableToLegalize;
3606 
3607   SmallVector<MachineInstrBuilder, 4> NewInsts;
3608 
3609   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3610   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3611 
3612   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
3613     Register SrcReg = MI.getOperand(I).getReg();
3614     LLT SrcTyI = MRI.getType(SrcReg);
3615     const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
3616                                             : ElementCount::getFixed(1);
3617     LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
3618     LLT LeftoverTyI;
3619 
3620     // Split this operand into the requested typed registers, and any leftover
3621     // required to reproduce the original type.
3622     if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
3623                       LeftoverRegs))
3624       return UnableToLegalize;
3625 
3626     if (I == 1) {
3627       // For the first operand, create an instruction for each part and setup
3628       // the result.
3629       for (Register PartReg : PartRegs) {
3630         Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3631         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3632                                .addDef(PartDstReg)
3633                                .addUse(PartReg));
3634         DstRegs.push_back(PartDstReg);
3635       }
3636 
3637       for (Register LeftoverReg : LeftoverRegs) {
3638         Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
3639         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3640                                .addDef(PartDstReg)
3641                                .addUse(LeftoverReg));
3642         LeftoverDstRegs.push_back(PartDstReg);
3643       }
3644     } else {
3645       assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());
3646 
3647       // Add the newly created operand splits to the existing instructions. The
3648       // odd-sized pieces are ordered after the requested NarrowTyArg sized
3649       // pieces.
3650       unsigned InstCount = 0;
3651       for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
3652         NewInsts[InstCount++].addUse(PartRegs[J]);
3653       for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
3654         NewInsts[InstCount++].addUse(LeftoverRegs[J]);
3655     }
3656 
3657     PartRegs.clear();
3658     LeftoverRegs.clear();
3659   }
3660 
3661   // Insert the newly built operations and rebuild the result register.
3662   for (auto &MIB : NewInsts)
3663     MIRBuilder.insertInstr(MIB);
3664 
3665   insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);
3666 
3667   MI.eraseFromParent();
3668   return Legalized;
3669 }
3670 
3671 LegalizerHelper::LegalizeResult
3672 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3673                                           LLT NarrowTy) {
3674   if (TypeIdx != 0)
3675     return UnableToLegalize;
3676 
3677   Register DstReg = MI.getOperand(0).getReg();
3678   Register SrcReg = MI.getOperand(1).getReg();
3679   LLT DstTy = MRI.getType(DstReg);
3680   LLT SrcTy = MRI.getType(SrcReg);
3681 
3682   LLT NarrowTy0 = NarrowTy;
3683   LLT NarrowTy1;
3684   unsigned NumParts;
3685 
3686   if (NarrowTy.isVector()) {
3687     // Uneven breakdown not handled.
3688     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3689     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3690       return UnableToLegalize;
3691 
3692     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3693   } else {
3694     NumParts = DstTy.getNumElements();
3695     NarrowTy1 = SrcTy.getElementType();
3696   }
3697 
3698   SmallVector<Register, 4> SrcRegs, DstRegs;
3699   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3700 
3701   for (unsigned I = 0; I < NumParts; ++I) {
3702     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3703     MachineInstr *NewInst =
3704         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3705 
3706     NewInst->setFlags(MI.getFlags());
3707     DstRegs.push_back(DstReg);
3708   }
3709 
3710   if (NarrowTy.isVector())
3711     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3712   else
3713     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3714 
3715   MI.eraseFromParent();
3716   return Legalized;
3717 }
3718 
3719 LegalizerHelper::LegalizeResult
3720 LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
3721                                         LLT NarrowTy) {
3722   Register DstReg = MI.getOperand(0).getReg();
3723   Register Src0Reg = MI.getOperand(2).getReg();
3724   LLT DstTy = MRI.getType(DstReg);
3725   LLT SrcTy = MRI.getType(Src0Reg);
3726 
3727   unsigned NumParts;
3728   LLT NarrowTy0, NarrowTy1;
3729 
3730   if (TypeIdx == 0) {
3731     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3732     unsigned OldElts = DstTy.getNumElements();
3733 
3734     NarrowTy0 = NarrowTy;
    NumParts =
        NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
3736     NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
3737                                                   SrcTy.getScalarSizeInBits())
3738                                     : SrcTy.getElementType();
3739 
3740   } else {
3741     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3742     unsigned OldElts = SrcTy.getNumElements();
3743 
    NumParts =
        NarrowTy.isVector() ? (OldElts / NewElts) : NarrowTy.getNumElements();
3746     NarrowTy0 =
3747         LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
3748     NarrowTy1 = NarrowTy;
3749   }
3750 
3751   // FIXME: Don't know how to handle the situation where the small vectors
3752   // aren't all the same size yet.
3753   if (NarrowTy1.isVector() &&
3754       NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
3755     return UnableToLegalize;
3756 
3757   CmpInst::Predicate Pred
3758     = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3759 
3760   SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
3761   extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
3762   extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);
3763 
3764   for (unsigned I = 0; I < NumParts; ++I) {
3765     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3766     DstRegs.push_back(DstReg);
3767 
3768     if (MI.getOpcode() == TargetOpcode::G_ICMP)
3769       MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3770     else {
3771       MachineInstr *NewCmp
3772         = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3773       NewCmp->setFlags(MI.getFlags());
3774     }
3775   }
3776 
3777   if (NarrowTy1.isVector())
3778     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3779   else
3780     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3781 
3782   MI.eraseFromParent();
3783   return Legalized;
3784 }
3785 
3786 LegalizerHelper::LegalizeResult
3787 LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
3788                                            LLT NarrowTy) {
3789   Register DstReg = MI.getOperand(0).getReg();
3790   Register CondReg = MI.getOperand(1).getReg();
3791 
3792   unsigned NumParts = 0;
3793   LLT NarrowTy0, NarrowTy1;
3794 
3795   LLT DstTy = MRI.getType(DstReg);
3796   LLT CondTy = MRI.getType(CondReg);
3797   unsigned Size = DstTy.getSizeInBits();
3798 
3799   assert(TypeIdx == 0 || CondTy.isVector());
3800 
3801   if (TypeIdx == 0) {
3802     NarrowTy0 = NarrowTy;
3803     NarrowTy1 = CondTy;
3804 
3805     unsigned NarrowSize = NarrowTy0.getSizeInBits();
3806     // FIXME: Don't know how to handle the situation where the small vectors
3807     // aren't all the same size yet.
3808     if (Size % NarrowSize != 0)
3809       return UnableToLegalize;
3810 
3811     NumParts = Size / NarrowSize;
3812 
3813     // Need to break down the condition type
3814     if (CondTy.isVector()) {
3815       if (CondTy.getNumElements() == NumParts)
3816         NarrowTy1 = CondTy.getElementType();
3817       else
3818         NarrowTy1 =
3819             LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
3820                         CondTy.getScalarSizeInBits());
3821     }
3822   } else {
    NumParts = CondTy.getNumElements();
    if (NarrowTy.isVector()) {
      // TODO: Handle breaking the condition down into narrow vectors,
      // including the uneven breakdown cases; only scalarization is
      // implemented so far.
      return UnableToLegalize;
    }

    NarrowTy0 = DstTy.getElementType();
    NarrowTy1 = NarrowTy;
3834   }
3835 
3836   SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
3837   if (CondTy.isVector())
3838     extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);
3839 
3840   extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
3841   extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);
3842 
3843   for (unsigned i = 0; i < NumParts; ++i) {
3844     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3845     MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
3846                            Src1Regs[i], Src2Regs[i]);
3847     DstRegs.push_back(DstReg);
3848   }
3849 
3850   if (NarrowTy0.isVector())
3851     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3852   else
3853     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3854 
3855   MI.eraseFromParent();
3856   return Legalized;
3857 }
3858 
3859 LegalizerHelper::LegalizeResult
3860 LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
3861                                         LLT NarrowTy) {
3862   const Register DstReg = MI.getOperand(0).getReg();
3863   LLT PhiTy = MRI.getType(DstReg);
3864   LLT LeftoverTy;
3865 
3866   // All of the operands need to have the same number of elements, so if we can
3867   // determine a type breakdown for the result type, we can for all of the
3868   // source types.
3869   int NumParts, NumLeftover;
3870   std::tie(NumParts, NumLeftover)
3871     = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
3872   if (NumParts < 0)
3873     return UnableToLegalize;
3874 
3875   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3876   SmallVector<MachineInstrBuilder, 4> NewInsts;
3877 
3878   const int TotalNumParts = NumParts + NumLeftover;
3879 
3880   // Insert the new phis in the result block first.
3881   for (int I = 0; I != TotalNumParts; ++I) {
3882     LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
3883     Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
3884     NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
3885                        .addDef(PartDstReg));
3886     if (I < NumParts)
3887       DstRegs.push_back(PartDstReg);
3888     else
3889       LeftoverDstRegs.push_back(PartDstReg);
3890   }
3891 
3892   MachineBasicBlock *MBB = MI.getParent();
3893   MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
3894   insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);
3895 
3896   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3897 
3898   // Insert code to extract the incoming values in each predecessor block.
3899   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3900     PartRegs.clear();
3901     LeftoverRegs.clear();
3902 
3903     Register SrcReg = MI.getOperand(I).getReg();
3904     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3905     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3906 
3907     LLT Unused;
3908     if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
3909                       LeftoverRegs))
3910       return UnableToLegalize;
3911 
    // Add the newly created operand splits to the existing instructions. The
    // odd-sized pieces are ordered after the requested NarrowTy sized pieces.
3915     for (int J = 0; J != TotalNumParts; ++J) {
3916       MachineInstrBuilder MIB = NewInsts[J];
3917       MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
3918       MIB.addMBB(&OpMBB);
3919     }
3920   }
3921 
3922   MI.eraseFromParent();
3923   return Legalized;
3924 }
3925 
3926 LegalizerHelper::LegalizeResult
3927 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3928                                                   unsigned TypeIdx,
3929                                                   LLT NarrowTy) {
3930   if (TypeIdx != 1)
3931     return UnableToLegalize;
3932 
3933   const int NumDst = MI.getNumOperands() - 1;
3934   const Register SrcReg = MI.getOperand(NumDst).getReg();
3935   LLT SrcTy = MRI.getType(SrcReg);
3936 
3937   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3938 
3939   // TODO: Create sequence of extracts.
3940   if (DstTy == NarrowTy)
3941     return UnableToLegalize;
3942 
3943   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3944   if (DstTy == GCDTy) {
3945     // This would just be a copy of the same unmerge.
3946     // TODO: Create extracts, pad with undef and create intermediate merges.
3947     return UnableToLegalize;
3948   }
3949 
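  // E.g. (a sketch) with NarrowTy = <4 x s8>:
  //   %0:_(<2 x s8>), %1, %2, %3 = G_UNMERGE_VALUES %src:_(<8 x s8>)
  // becomes a two-level unmerge through the GCD type:
  //   %lo:_(<4 x s8>), %hi:_(<4 x s8>) = G_UNMERGE_VALUES %src:_(<8 x s8>)
  //   %0:_(<2 x s8>), %1:_(<2 x s8>) = G_UNMERGE_VALUES %lo:_(<4 x s8>)
  //   %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %hi:_(<4 x s8>)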
3950   auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
3951   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3952   const int PartsPerUnmerge = NumDst / NumUnmerge;
3953 
3954   for (int I = 0; I != NumUnmerge; ++I) {
3955     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3956 
3957     for (int J = 0; J != PartsPerUnmerge; ++J)
3958       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3959     MIB.addUse(Unmerge.getReg(I));
3960   }
3961 
3962   MI.eraseFromParent();
3963   return Legalized;
3964 }
3965 
3966 LegalizerHelper::LegalizeResult
3967 LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
3968                                          LLT NarrowTy) {
3969   Register Result = MI.getOperand(0).getReg();
3970   Register Overflow = MI.getOperand(1).getReg();
3971   Register LHS = MI.getOperand(2).getReg();
3972   Register RHS = MI.getOperand(3).getReg();
3973 
3974   LLT SrcTy = MRI.getType(LHS);
3975   if (!SrcTy.isVector())
3976     return UnableToLegalize;
3977 
3978   LLT ElementType = SrcTy.getElementType();
3979   LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
3980   const ElementCount NumResult = SrcTy.getElementCount();
3981   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3982 
3983   // Unmerge the operands to smaller parts of GCD type.
3984   auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
3985   auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);
3986 
3987   const int NumOps = UnmergeLHS->getNumOperands() - 1;
3988   const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
3989   LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
3990   LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);
3991 
3992   // Perform the operation over unmerged parts.
3993   SmallVector<Register, 8> ResultParts;
3994   SmallVector<Register, 8> OverflowParts;
3995   for (int I = 0; I != NumOps; ++I) {
3996     Register Operand1 = UnmergeLHS->getOperand(I).getReg();
3997     Register Operand2 = UnmergeRHS->getOperand(I).getReg();
3998     auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
3999                                          {Operand1, Operand2});
4000     ResultParts.push_back(PartMul->getOperand(0).getReg());
4001     OverflowParts.push_back(PartMul->getOperand(1).getReg());
4002   }
4003 
4004   LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
4005   LLT OverflowLCMTy =
4006       LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);
4007 
4008   // Recombine the pieces to the original result and overflow registers.
4009   buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
4010   buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
4011   MI.eraseFromParent();
4012   return Legalized;
4013 }
4014 
// Handle FewerElementsVector for a G_BUILD_VECTOR or G_CONCAT_VECTORS that
// produces a vector.
4017 //
4018 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
4019 // undef as necessary.
4020 //
4021 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
4022 //   -> <2 x s16>
4023 //
4024 // %4:_(s16) = G_IMPLICIT_DEF
4025 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
4026 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
4027 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
4028 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
// %3:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %8
4030 LegalizerHelper::LegalizeResult
4031 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4032                                           LLT NarrowTy) {
4033   Register DstReg = MI.getOperand(0).getReg();
4034   LLT DstTy = MRI.getType(DstReg);
4035   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
4036   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
4037 
4038   // Break into a common type
4039   SmallVector<Register, 16> Parts;
4040   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
4041     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
4042 
4043   // Build the requested new merge, padding with undef.
4044   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
4045                                   TargetOpcode::G_ANYEXT);
4046 
4047   // Pack into the original result register.
4048   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4049 
4050   MI.eraseFromParent();
4051   return Legalized;
4052 }
4053 
4054 LegalizerHelper::LegalizeResult
4055 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4056                                                            unsigned TypeIdx,
4057                                                            LLT NarrowVecTy) {
4058   Register DstReg = MI.getOperand(0).getReg();
4059   Register SrcVec = MI.getOperand(1).getReg();
4060   Register InsertVal;
4061   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4062 
4063   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4064   if (IsInsert)
4065     InsertVal = MI.getOperand(2).getReg();
4066 
4067   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4068 
4069   // TODO: Handle total scalarization case.
4070   if (!NarrowVecTy.isVector())
4071     return UnableToLegalize;
4072 
4073   LLT VecTy = MRI.getType(SrcVec);
4074 
  // If the index is a constant, we can break this down directly and index
  // into the target sized pieces.
4077   int64_t IdxVal;
4078   auto MaybeCst =
4079       getConstantVRegValWithLookThrough(Idx, MRI, /*LookThroughInstrs*/ true,
4080                                         /*HandleFConstants*/ false);
4081   if (MaybeCst) {
4082     IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out-of-bounds indexing into the pieces.
4084     if (IdxVal >= VecTy.getNumElements()) {
4085       MIRBuilder.buildUndef(DstReg);
4086       MI.eraseFromParent();
4087       return Legalized;
4088     }
4089 
4090     SmallVector<Register, 8> VecParts;
4091     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4092 
4093     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4094     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4095                                     TargetOpcode::G_ANYEXT);
4096 
4097     unsigned NewNumElts = NarrowVecTy.getNumElements();
4098 
4099     LLT IdxTy = MRI.getType(Idx);
4100     int64_t PartIdx = IdxVal / NewNumElts;
4101     auto NewIdx =
4102         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4103 
4104     if (IsInsert) {
4105       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4106 
4107       // Use the adjusted index to insert into one of the subvectors.
4108       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4109           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4110       VecParts[PartIdx] = InsertPart.getReg(0);
4111 
4112       // Recombine the inserted subvector with the others to reform the result
4113       // vector.
4114       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4115     } else {
4116       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4117     }
4118 
4119     MI.eraseFromParent();
4120     return Legalized;
4121   }
4122 
4123   // With a variable index, we can't perform the operation in a smaller type, so
4124   // we're forced to expand this.
4125   //
4126   // TODO: We could emit a chain of compare/select to figure out which piece to
4127   // index.
4128   return lowerExtractInsertVectorElt(MI);
4129 }
4130 
4131 LegalizerHelper::LegalizeResult
4132 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4133                                       LLT NarrowTy) {
4134   // FIXME: Don't know how to handle secondary types yet.
4135   if (TypeIdx != 0)
4136     return UnableToLegalize;
4137 
4138   // This implementation doesn't work for atomics. Give up instead of doing
4139   // something invalid.
4140   if (LdStMI.isAtomic())
4141     return UnableToLegalize;
4142 
4143   bool IsLoad = isa<GLoad>(LdStMI);
4144   Register ValReg = LdStMI.getReg(0);
4145   Register AddrReg = LdStMI.getPointerReg();
4146   LLT ValTy = MRI.getType(ValReg);
4147 
4148   // FIXME: Do we need a distinct NarrowMemory legalize action?
4149   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4150     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4151     return UnableToLegalize;
4152   }
4153 
4154   int NumParts = -1;
4155   int NumLeftover = -1;
4156   LLT LeftoverTy;
4157   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4158   if (IsLoad) {
    std::tie(NumParts, NumLeftover) =
        getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4160   } else {
4161     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4162                      NarrowLeftoverRegs)) {
4163       NumParts = NarrowRegs.size();
4164       NumLeftover = NarrowLeftoverRegs.size();
4165     }
4166   }
4167 
4168   if (NumParts == -1)
4169     return UnableToLegalize;
4170 
4171   LLT PtrTy = MRI.getType(AddrReg);
4172   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4173 
4174   unsigned TotalSize = ValTy.getSizeInBits();
4175 
4176   // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
4179   // handled.
4180   auto MMO = LdStMI.getMMO();
4181   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4182                              unsigned Offset) -> unsigned {
4183     MachineFunction &MF = MIRBuilder.getMF();
4184     unsigned PartSize = PartTy.getSizeInBits();
4185     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4186          Offset += PartSize, ++Idx) {
4187       unsigned ByteOffset = Offset / 8;
4188       Register NewAddrReg;
4189 
4190       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4191 
4192       MachineMemOperand *NewMMO =
4193           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4194 
4195       if (IsLoad) {
4196         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4197         ValRegs.push_back(Dst);
4198         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4199       } else {
4200         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4201       }
4202     }
4203 
4204     return Offset;
4205   };
4206 
4207   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4208 
4209   // Handle the rest of the register if this isn't an even type breakdown.
4210   if (LeftoverTy.isValid())
4211     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4212 
4213   if (IsLoad) {
4214     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4215                 LeftoverTy, NarrowLeftoverRegs);
4216   }
4217 
4218   LdStMI.eraseFromParent();
4219   return Legalized;
4220 }
4221 
4222 LegalizerHelper::LegalizeResult
4223 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
4224                                       LLT NarrowTy) {
4225   assert(TypeIdx == 0 && "only one type index expected");
4226 
4227   const unsigned Opc = MI.getOpcode();
4228   const int NumDefOps = MI.getNumExplicitDefs();
4229   const int NumSrcOps = MI.getNumOperands() - NumDefOps;
4230   const unsigned Flags = MI.getFlags();
4231   const unsigned NarrowSize = NarrowTy.getSizeInBits();
4232   const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
4233 
4234   assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
4235                                      "result and 1-3 sources or 2 results and "
4236                                      "1-2 sources");
4237 
4238   SmallVector<Register, 2> DstRegs;
4239   for (int I = 0; I < NumDefOps; ++I)
4240     DstRegs.push_back(MI.getOperand(I).getReg());
4241 
  // First of all, check whether we are narrowing (changing the element type)
  // or reducing the number of vector elements.
4244   const LLT DstTy = MRI.getType(DstRegs[0]);
4245   const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
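  // For example, with DstTy = v4s32, NarrowTy = v2s32 keeps the element type
  // and only reduces the element count, while NarrowTy = s16 changes the
  // element type and is treated as narrowing.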
4246 
4247   SmallVector<Register, 8> ExtractedRegs[3];
4248   SmallVector<Register, 8> Parts;
4249 
4250   // Break down all the sources into NarrowTy pieces we can operate on. This may
4251   // involve creating merges to a wider type, padded with undef.
4252   for (int I = 0; I != NumSrcOps; ++I) {
4253     Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
4254     LLT SrcTy = MRI.getType(SrcReg);
4255 
4256     // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
4257     // For fewerElements, this is a smaller vector with the same element type.
4258     LLT OpNarrowTy;
4259     if (IsNarrow) {
4260       OpNarrowTy = NarrowScalarTy;
4261 
      // In case of narrowing, we need to cast vectors to scalars for this to
      // work properly.
4264       // FIXME: Can we do without the bitcast here if we're narrowing?
4265       if (SrcTy.isVector()) {
4266         SrcTy = LLT::scalar(SrcTy.getSizeInBits());
4267         SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
4268       }
4269     } else {
4270       auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
4271                                           : ElementCount::getFixed(1);
4272       OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
4273     }
4274 
4275     LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
4276 
4277     // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
4278     buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
4279                         TargetOpcode::G_ANYEXT);
4280   }
4281 
4282   SmallVector<Register, 8> ResultRegs[2];
4283 
4284   // Input operands for each sub-instruction.
4285   SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());
4286 
4287   int NumParts = ExtractedRegs[0].size();
4288   const unsigned DstSize = DstTy.getSizeInBits();
4289   const LLT DstScalarTy = LLT::scalar(DstSize);
4290 
  // Narrowing needs to use scalar types.
4292   LLT DstLCMTy, NarrowDstTy;
4293   if (IsNarrow) {
4294     DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
4295     NarrowDstTy = NarrowScalarTy;
4296   } else {
4297     DstLCMTy = getLCMType(DstTy, NarrowTy);
4298     NarrowDstTy = NarrowTy;
4299   }
4300 
4301   // We widened the source registers to satisfy merge/unmerge size
4302   // constraints. We'll have some extra fully undef parts.
4303   const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
4304 
4305   for (int I = 0; I != NumRealParts; ++I) {
4306     // Emit this instruction on each of the split pieces.
4307     for (int J = 0; J != NumSrcOps; ++J)
4308       InputRegs[J] = ExtractedRegs[J][I];
4309 
4310     MachineInstrBuilder Inst;
4311     if (NumDefOps == 1)
4312       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
4313     else
4314       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
4315                                    Flags);
4316 
4317     for (int J = 0; J != NumDefOps; ++J)
4318       ResultRegs[J].push_back(Inst.getReg(J));
4319   }
4320 
4321   // Fill out the widened result with undef instead of creating instructions
4322   // with undef inputs.
4323   int NumUndefParts = NumParts - NumRealParts;
4324   if (NumUndefParts != 0) {
4325     Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
4326     for (int I = 0; I != NumDefOps; ++I)
4327       ResultRegs[I].append(NumUndefParts, Undef);
4328   }
4329 
4330   // Extract the possibly padded result. Use a scratch register if we need to do
4331   // a final bitcast, otherwise use the original result register.
4332   Register MergeDstReg;
4333   for (int I = 0; I != NumDefOps; ++I) {
4334     if (IsNarrow && DstTy.isVector())
4335       MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
4336     else
4337       MergeDstReg = DstRegs[I];
4338 
4339     buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);
4340 
    // Recast to vector if we narrowed a vector.
4342     if (IsNarrow && DstTy.isVector())
4343       MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
4344   }
4345 
4346   MI.eraseFromParent();
4347   return Legalized;
4348 }
4349 
4350 LegalizerHelper::LegalizeResult
4351 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4352                                               LLT NarrowTy) {
4353   Register DstReg = MI.getOperand(0).getReg();
4354   Register SrcReg = MI.getOperand(1).getReg();
4355   int64_t Imm = MI.getOperand(2).getImm();
4356 
4357   LLT DstTy = MRI.getType(DstReg);
4358 
4359   SmallVector<Register, 8> Parts;
4360   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4361   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4362 
4363   for (Register &R : Parts)
4364     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4365 
4366   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4367 
4368   MI.eraseFromParent();
4369   return Legalized;
4370 }
4371 
4372 LegalizerHelper::LegalizeResult
4373 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4374                                      LLT NarrowTy) {
4375   using namespace TargetOpcode;
4376 
4377   switch (MI.getOpcode()) {
4378   case G_IMPLICIT_DEF:
4379     return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
4380   case G_TRUNC:
4381   case G_AND:
4382   case G_OR:
4383   case G_XOR:
4384   case G_ADD:
4385   case G_SUB:
4386   case G_MUL:
4387   case G_PTR_ADD:
4388   case G_SMULH:
4389   case G_UMULH:
4390   case G_FADD:
4391   case G_FMUL:
4392   case G_FSUB:
4393   case G_FNEG:
4394   case G_FABS:
4395   case G_FCANONICALIZE:
4396   case G_FDIV:
4397   case G_FREM:
4398   case G_FMA:
4399   case G_FMAD:
4400   case G_FPOW:
4401   case G_FEXP:
4402   case G_FEXP2:
4403   case G_FLOG:
4404   case G_FLOG2:
4405   case G_FLOG10:
4406   case G_FNEARBYINT:
4407   case G_FCEIL:
4408   case G_FFLOOR:
4409   case G_FRINT:
4410   case G_INTRINSIC_ROUND:
4411   case G_INTRINSIC_ROUNDEVEN:
4412   case G_INTRINSIC_TRUNC:
4413   case G_FCOS:
4414   case G_FSIN:
4415   case G_FSQRT:
4416   case G_BSWAP:
4417   case G_BITREVERSE:
4418   case G_SDIV:
4419   case G_UDIV:
4420   case G_SREM:
4421   case G_UREM:
4422   case G_SDIVREM:
4423   case G_UDIVREM:
4424   case G_SMIN:
4425   case G_SMAX:
4426   case G_UMIN:
4427   case G_UMAX:
4428   case G_ABS:
4429   case G_FMINNUM:
4430   case G_FMAXNUM:
4431   case G_FMINNUM_IEEE:
4432   case G_FMAXNUM_IEEE:
4433   case G_FMINIMUM:
4434   case G_FMAXIMUM:
4435   case G_FSHL:
4436   case G_FSHR:
4437   case G_ROTL:
4438   case G_ROTR:
4439   case G_FREEZE:
4440   case G_SADDSAT:
4441   case G_SSUBSAT:
4442   case G_UADDSAT:
4443   case G_USUBSAT:
4444     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
4445   case G_UMULO:
4446   case G_SMULO:
4447     return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
4448   case G_SHL:
4449   case G_LSHR:
4450   case G_ASHR:
4451   case G_SSHLSAT:
4452   case G_USHLSAT:
4453   case G_CTLZ:
4454   case G_CTLZ_ZERO_UNDEF:
4455   case G_CTTZ:
4456   case G_CTTZ_ZERO_UNDEF:
4457   case G_CTPOP:
4458   case G_FCOPYSIGN:
4459     return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
4460   case G_ZEXT:
4461   case G_SEXT:
4462   case G_ANYEXT:
4463   case G_FPEXT:
4464   case G_FPTRUNC:
4465   case G_SITOFP:
4466   case G_UITOFP:
4467   case G_FPTOSI:
4468   case G_FPTOUI:
4469   case G_INTTOPTR:
4470   case G_PTRTOINT:
4471   case G_ADDRSPACE_CAST:
4472     return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
4473   case G_ICMP:
4474   case G_FCMP:
4475     return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
4476   case G_SELECT:
4477     return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
4478   case G_PHI:
4479     return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
4480   case G_UNMERGE_VALUES:
4481     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4482   case G_BUILD_VECTOR:
4483     assert(TypeIdx == 0 && "not a vector type index");
4484     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4485   case G_CONCAT_VECTORS:
4486     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4487       return UnableToLegalize;
4488     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4489   case G_EXTRACT_VECTOR_ELT:
4490   case G_INSERT_VECTOR_ELT:
4491     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4492   case G_LOAD:
4493   case G_STORE:
4494     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4495   case G_SEXT_INREG:
4496     return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
4497   GISEL_VECREDUCE_CASES_NONSEQ
4498     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4499   case G_SHUFFLE_VECTOR:
4500     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4501   default:
4502     return UnableToLegalize;
4503   }
4504 }
4505 
4506 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4507     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4508   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4509   if (TypeIdx != 0)
4510     return UnableToLegalize;
4511 
4512   Register DstReg = MI.getOperand(0).getReg();
4513   Register Src1Reg = MI.getOperand(1).getReg();
4514   Register Src2Reg = MI.getOperand(2).getReg();
4515   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4516   LLT DstTy = MRI.getType(DstReg);
4517   LLT Src1Ty = MRI.getType(Src1Reg);
4518   LLT Src2Ty = MRI.getType(Src2Reg);
4519   // The shuffle should be canonicalized by now.
4520   if (DstTy != Src1Ty)
4521     return UnableToLegalize;
4522   if (DstTy != Src2Ty)
4523     return UnableToLegalize;
4524 
4525   if (!isPowerOf2_32(DstTy.getNumElements()))
4526     return UnableToLegalize;
4527 
  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to split it further.
4530   NarrowTy =
4531       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4532   unsigned NewElts = NarrowTy.getNumElements();
4533 
4534   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4535   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4536   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4537   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4538                         SplitSrc2Regs[1]};
4539 
4540   Register Hi, Lo;
4541 
4542   // If Lo or Hi uses elements from at most two of the four input vectors, then
4543   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4544   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
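  //
  // For example, splitting a v4 shuffle with mask <0,4,1,5> at NewElts = 2:
  // Lo uses mask elements <0,4>, which index into Inputs[0] and Inputs[2],
  // so it becomes a shuffle of those two pieces with mask <0,2>; Hi is built
  // the same way from mask elements <1,5>.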
4545   SmallVector<int, 16> Ops;
4546   for (unsigned High = 0; High < 2; ++High) {
4547     Register &Output = High ? Hi : Lo;
4548 
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
4553     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4554     unsigned FirstMaskIdx = High * NewElts;
4555     bool UseBuildVector = false;
4556     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4557       // The mask element.  This indexes into the input.
4558       int Idx = Mask[FirstMaskIdx + MaskOffset];
4559 
4560       // The input vector this mask element indexes into.
4561       unsigned Input = (unsigned)Idx / NewElts;
4562 
4563       if (Input >= array_lengthof(Inputs)) {
4564         // The mask element does not index into any input vector.
4565         Ops.push_back(-1);
4566         continue;
4567       }
4568 
4569       // Turn the index into an offset from the start of the input vector.
4570       Idx -= Input * NewElts;
4571 
4572       // Find or create a shuffle vector operand to hold this input.
4573       unsigned OpNo;
4574       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4575         if (InputUsed[OpNo] == Input) {
4576           // This input vector is already an operand.
4577           break;
4578         } else if (InputUsed[OpNo] == -1U) {
4579           // Create a new operand for this input vector.
4580           InputUsed[OpNo] = Input;
4581           break;
4582         }
4583       }
4584 
4585       if (OpNo >= array_lengthof(InputUsed)) {
4586         // More than two input vectors used!  Give up on trying to create a
4587         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4588         UseBuildVector = true;
4589         break;
4590       }
4591 
4592       // Add the mask index for the new shuffle vector.
4593       Ops.push_back(Idx + OpNo * NewElts);
4594     }
4595 
4596     if (UseBuildVector) {
4597       LLT EltTy = NarrowTy.getElementType();
4598       SmallVector<Register, 16> SVOps;
4599 
4600       // Extract the input elements by hand.
4601       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4602         // The mask element.  This indexes into the input.
4603         int Idx = Mask[FirstMaskIdx + MaskOffset];
4604 
4605         // The input vector this mask element indexes into.
4606         unsigned Input = (unsigned)Idx / NewElts;
4607 
4608         if (Input >= array_lengthof(Inputs)) {
4609           // The mask element is "undef" or indexes off the end of the input.
4610           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4611           continue;
4612         }
4613 
4614         // Turn the index into an offset from the start of the input vector.
4615         Idx -= Input * NewElts;
4616 
4617         // Extract the vector element by hand.
4618         SVOps.push_back(MIRBuilder
4619                             .buildExtractVectorElement(
4620                                 EltTy, Inputs[Input],
4621                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4622                             .getReg(0));
4623       }
4624 
4625       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4626       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4627     } else if (InputUsed[0] == -1U) {
4628       // No input vectors were used! The result is undefined.
4629       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4630     } else {
4631       Register Op0 = Inputs[InputUsed[0]];
4632       // If only one input was used, use an undefined vector for the other.
4633       Register Op1 = InputUsed[1] == -1U
4634                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4635                          : Inputs[InputUsed[1]];
4636       // At least one input vector was used. Create a new shuffle vector.
4637       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4638     }
4639 
4640     Ops.clear();
4641   }
4642 
4643   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4644   MI.eraseFromParent();
4645   return Legalized;
4646 }
4647 
4648 static unsigned getScalarOpcForReduction(unsigned Opc) {
4649   unsigned ScalarOpc;
4650   switch (Opc) {
4651   case TargetOpcode::G_VECREDUCE_FADD:
4652     ScalarOpc = TargetOpcode::G_FADD;
4653     break;
4654   case TargetOpcode::G_VECREDUCE_FMUL:
4655     ScalarOpc = TargetOpcode::G_FMUL;
4656     break;
4657   case TargetOpcode::G_VECREDUCE_FMAX:
4658     ScalarOpc = TargetOpcode::G_FMAXNUM;
4659     break;
4660   case TargetOpcode::G_VECREDUCE_FMIN:
4661     ScalarOpc = TargetOpcode::G_FMINNUM;
4662     break;
4663   case TargetOpcode::G_VECREDUCE_ADD:
4664     ScalarOpc = TargetOpcode::G_ADD;
4665     break;
4666   case TargetOpcode::G_VECREDUCE_MUL:
4667     ScalarOpc = TargetOpcode::G_MUL;
4668     break;
4669   case TargetOpcode::G_VECREDUCE_AND:
4670     ScalarOpc = TargetOpcode::G_AND;
4671     break;
4672   case TargetOpcode::G_VECREDUCE_OR:
4673     ScalarOpc = TargetOpcode::G_OR;
4674     break;
4675   case TargetOpcode::G_VECREDUCE_XOR:
4676     ScalarOpc = TargetOpcode::G_XOR;
4677     break;
4678   case TargetOpcode::G_VECREDUCE_SMAX:
4679     ScalarOpc = TargetOpcode::G_SMAX;
4680     break;
4681   case TargetOpcode::G_VECREDUCE_SMIN:
4682     ScalarOpc = TargetOpcode::G_SMIN;
4683     break;
4684   case TargetOpcode::G_VECREDUCE_UMAX:
4685     ScalarOpc = TargetOpcode::G_UMAX;
4686     break;
4687   case TargetOpcode::G_VECREDUCE_UMIN:
4688     ScalarOpc = TargetOpcode::G_UMIN;
4689     break;
4690   default:
4691     llvm_unreachable("Unhandled reduction");
4692   }
4693   return ScalarOpc;
4694 }
4695 
4696 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4697     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4698   unsigned Opc = MI.getOpcode();
4699   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4700          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4701          "Sequential reductions not expected");
4702 
4703   if (TypeIdx != 1)
4704     return UnableToLegalize;
4705 
4706   // The semantics of the normal non-sequential reductions allow us to freely
4707   // re-associate the operation.
4708   Register SrcReg = MI.getOperand(1).getReg();
4709   LLT SrcTy = MRI.getType(SrcReg);
4710   Register DstReg = MI.getOperand(0).getReg();
4711   LLT DstTy = MRI.getType(DstReg);
4712 
4713   if (NarrowTy.isVector() &&
4714       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4715     return UnableToLegalize;
4716 
4717   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4718   SmallVector<Register> SplitSrcs;
4719   // If NarrowTy is a scalar then we're being asked to scalarize.
4720   const unsigned NumParts =
4721       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4722                           : SrcTy.getNumElements();
4723 
4724   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4725   if (NarrowTy.isScalar()) {
4726     if (DstTy != NarrowTy)
4727       return UnableToLegalize; // FIXME: handle implicit extensions.
4728 
4729     if (isPowerOf2_32(NumParts)) {
4730       // Generate a tree of scalar operations to reduce the critical path.
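      // For example, 8 scalar parts reduce pairwise to 4, then 2, then 1,
      // giving a depth of log2(8) = 3 instead of 7 sequential operations.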
4731       SmallVector<Register> PartialResults;
4732       unsigned NumPartsLeft = NumParts;
4733       while (NumPartsLeft > 1) {
4734         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4735           PartialResults.emplace_back(
4736               MIRBuilder
4737                   .buildInstr(ScalarOpc, {NarrowTy},
4738                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4739                   .getReg(0));
4740         }
4741         SplitSrcs = PartialResults;
4742         PartialResults.clear();
4743         NumPartsLeft = SplitSrcs.size();
4744       }
4745       assert(SplitSrcs.size() == 1);
4746       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4747       MI.eraseFromParent();
4748       return Legalized;
4749     }
4750     // If we can't generate a tree, then just do sequential operations.
4751     Register Acc = SplitSrcs[0];
4752     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4753       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4754                 .getReg(0);
4755     MIRBuilder.buildCopy(DstReg, Acc);
4756     MI.eraseFromParent();
4757     return Legalized;
4758   }
4759   SmallVector<Register> PartialReductions;
4760   for (unsigned Part = 0; Part < NumParts; ++Part) {
4761     PartialReductions.push_back(
4762         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4763   }
4764 
4765 
4766   // If the types involved are powers of 2, we can generate intermediate vector
4767   // ops, before generating a final reduction operation.
4768   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4769       isPowerOf2_32(NarrowTy.getNumElements())) {
4770     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4771   }
4772 
4773   Register Acc = PartialReductions[0];
4774   for (unsigned Part = 1; Part < NumParts; ++Part) {
4775     if (Part == NumParts - 1) {
4776       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4777                             {Acc, PartialReductions[Part]});
4778     } else {
4779       Acc = MIRBuilder
4780                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4781                 .getReg(0);
4782     }
4783   }
4784   MI.eraseFromParent();
4785   return Legalized;
4786 }
4787 
4788 LegalizerHelper::LegalizeResult
4789 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4790                                         LLT SrcTy, LLT NarrowTy,
4791                                         unsigned ScalarOpc) {
4792   SmallVector<Register> SplitSrcs;
  // Split the sources into NarrowTy-sized pieces.
4794   extractParts(SrcReg, NarrowTy,
4795                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
  // We're going to do a tree reduction using vector operations until we have
  // one NarrowTy-sized value left.
4798   while (SplitSrcs.size() > 1) {
4799     SmallVector<Register> PartialRdxs;
4800     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4801       Register LHS = SplitSrcs[Idx];
4802       Register RHS = SplitSrcs[Idx + 1];
4803       // Create the intermediate vector op.
4804       Register Res =
4805           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4806       PartialRdxs.push_back(Res);
4807     }
4808     SplitSrcs = std::move(PartialRdxs);
4809   }
  // Finally generate the requested NarrowTy-based reduction.
4811   Observer.changingInstr(MI);
4812   MI.getOperand(1).setReg(SplitSrcs[0]);
4813   Observer.changedInstr(MI);
4814   return Legalized;
4815 }
4816 
4817 LegalizerHelper::LegalizeResult
4818 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
4822   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4823   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4824 
4825   if (Amt.isNullValue()) {
4826     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4827     MI.eraseFromParent();
4828     return Legalized;
4829   }
4830 
4831   LLT NVT = HalfTy;
4832   unsigned NVTBits = HalfTy.getSizeInBits();
4833   unsigned VTBits = 2 * NVTBits;
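  // For example, a G_SHL of an s64 value by a constant amount of 40 with
  // HalfTy = s32 hits the Amt.ugt(NVTBits) case below: Lo = 0 and
  // Hi = InL << 8, since the shifted-in low bits land in the high half.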
4834 
4835   SrcOp Lo(Register(0)), Hi(Register(0));
4836   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4837     if (Amt.ugt(VTBits)) {
4838       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4839     } else if (Amt.ugt(NVTBits)) {
4840       Lo = MIRBuilder.buildConstant(NVT, 0);
4841       Hi = MIRBuilder.buildShl(NVT, InL,
4842                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4843     } else if (Amt == NVTBits) {
4844       Lo = MIRBuilder.buildConstant(NVT, 0);
4845       Hi = InL;
4846     } else {
4847       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4848       auto OrLHS =
4849           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4850       auto OrRHS = MIRBuilder.buildLShr(
4851           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4852       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4853     }
4854   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4855     if (Amt.ugt(VTBits)) {
4856       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4857     } else if (Amt.ugt(NVTBits)) {
4858       Lo = MIRBuilder.buildLShr(NVT, InH,
4859                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4860       Hi = MIRBuilder.buildConstant(NVT, 0);
4861     } else if (Amt == NVTBits) {
4862       Lo = InH;
4863       Hi = MIRBuilder.buildConstant(NVT, 0);
4864     } else {
4865       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4866 
4867       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4868       auto OrRHS = MIRBuilder.buildShl(
4869           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4870 
4871       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4872       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4873     }
4874   } else {
4875     if (Amt.ugt(VTBits)) {
4876       Hi = Lo = MIRBuilder.buildAShr(
4877           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4878     } else if (Amt.ugt(NVTBits)) {
4879       Lo = MIRBuilder.buildAShr(NVT, InH,
4880                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4881       Hi = MIRBuilder.buildAShr(NVT, InH,
4882                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4883     } else if (Amt == NVTBits) {
4884       Lo = InH;
4885       Hi = MIRBuilder.buildAShr(NVT, InH,
4886                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4887     } else {
4888       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4889 
4890       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4891       auto OrRHS = MIRBuilder.buildShl(
4892           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4893 
4894       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4895       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4896     }
4897   }
4898 
4899   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4900   MI.eraseFromParent();
4901 
4902   return Legalized;
4903 }
4904 
4905 // TODO: Optimize if constant shift amount.
4906 LegalizerHelper::LegalizeResult
4907 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4908                                    LLT RequestedTy) {
4909   if (TypeIdx == 1) {
4910     Observer.changingInstr(MI);
4911     narrowScalarSrc(MI, RequestedTy, 2);
4912     Observer.changedInstr(MI);
4913     return Legalized;
4914   }
4915 
4916   Register DstReg = MI.getOperand(0).getReg();
4917   LLT DstTy = MRI.getType(DstReg);
4918   if (DstTy.isVector())
4919     return UnableToLegalize;
4920 
4921   Register Amt = MI.getOperand(2).getReg();
4922   LLT ShiftAmtTy = MRI.getType(Amt);
4923   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4924   if (DstEltSize % 2 != 0)
4925     return UnableToLegalize;
4926 
4927   // Ignore the input type. We can only go to exactly half the size of the
4928   // input. If that isn't small enough, the resulting pieces will be further
4929   // legalized.
4930   const unsigned NewBitSize = DstEltSize / 2;
4931   const LLT HalfTy = LLT::scalar(NewBitSize);
4932   const LLT CondTy = LLT::scalar(1);
4933 
4934   if (auto VRegAndVal =
4935           getConstantVRegValWithLookThrough(Amt, MRI, true, false)) {
4936     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4937                                        ShiftAmtTy);
4938   }
4939 
4940   // TODO: Expand with known bits.
4941 
  // Handle the fully general expansion for an unknown shift amount.
4943   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4944 
4945   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4946   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4947   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4948 
4949   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4950   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4951 
4952   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4953   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4954   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
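  // Each result half is a select between the "short" shift (Amt <
  // NewBitSize, bits cross between the halves) and the "long" shift (Amt >=
  // NewBitSize, one half comes entirely from the other). The half that mixes
  // in bits from the other input half is additionally guarded by IsZero,
  // since the AmtLack shift below is out of range when Amt == 0.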
4955 
4956   Register ResultRegs[2];
4957   switch (MI.getOpcode()) {
4958   case TargetOpcode::G_SHL: {
4959     // Short: ShAmt < NewBitSize
4960     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4961 
4962     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4963     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4964     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4965 
4966     // Long: ShAmt >= NewBitSize
4967     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4968     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4969 
4970     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4971     auto Hi = MIRBuilder.buildSelect(
4972         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4973 
4974     ResultRegs[0] = Lo.getReg(0);
4975     ResultRegs[1] = Hi.getReg(0);
4976     break;
4977   }
4978   case TargetOpcode::G_LSHR:
4979   case TargetOpcode::G_ASHR: {
4980     // Short: ShAmt < NewBitSize
4981     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4982 
4983     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4984     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4985     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4986 
4987     // Long: ShAmt >= NewBitSize
4988     MachineInstrBuilder HiL;
4989     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4990       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4991     } else {
4992       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4993       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4994     }
4995     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4996                                      {InH, AmtExcess});     // Lo from Hi part.
4997 
4998     auto Lo = MIRBuilder.buildSelect(
4999         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
5000 
5001     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
5002 
5003     ResultRegs[0] = Lo.getReg(0);
5004     ResultRegs[1] = Hi.getReg(0);
5005     break;
5006   }
5007   default:
5008     llvm_unreachable("not a shift");
5009   }
5010 
5011   MIRBuilder.buildMerge(DstReg, ResultRegs);
5012   MI.eraseFromParent();
5013   return Legalized;
5014 }
5015 
5016 LegalizerHelper::LegalizeResult
5017 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5018                                        LLT MoreTy) {
5019   assert(TypeIdx == 0 && "Expecting only Idx 0");
5020 
5021   Observer.changingInstr(MI);
5022   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5023     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5024     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5025     moreElementsVectorSrc(MI, MoreTy, I);
5026   }
5027 
5028   MachineBasicBlock &MBB = *MI.getParent();
5029   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5030   moreElementsVectorDst(MI, MoreTy, 0);
5031   Observer.changedInstr(MI);
5032   return Legalized;
5033 }
5034 
5035 LegalizerHelper::LegalizeResult
5036 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5037                                     LLT MoreTy) {
5038   unsigned Opc = MI.getOpcode();
5039   switch (Opc) {
5040   case TargetOpcode::G_IMPLICIT_DEF:
5041   case TargetOpcode::G_LOAD: {
5042     if (TypeIdx != 0)
5043       return UnableToLegalize;
5044     Observer.changingInstr(MI);
5045     moreElementsVectorDst(MI, MoreTy, 0);
5046     Observer.changedInstr(MI);
5047     return Legalized;
5048   }
5049   case TargetOpcode::G_STORE:
5050     if (TypeIdx != 0)
5051       return UnableToLegalize;
5052     Observer.changingInstr(MI);
5053     moreElementsVectorSrc(MI, MoreTy, 0);
5054     Observer.changedInstr(MI);
5055     return Legalized;
5056   case TargetOpcode::G_AND:
5057   case TargetOpcode::G_OR:
5058   case TargetOpcode::G_XOR:
5059   case TargetOpcode::G_SMIN:
5060   case TargetOpcode::G_SMAX:
5061   case TargetOpcode::G_UMIN:
5062   case TargetOpcode::G_UMAX:
5063   case TargetOpcode::G_FMINNUM:
5064   case TargetOpcode::G_FMAXNUM:
5065   case TargetOpcode::G_FMINNUM_IEEE:
5066   case TargetOpcode::G_FMAXNUM_IEEE:
5067   case TargetOpcode::G_FMINIMUM:
5068   case TargetOpcode::G_FMAXIMUM: {
5069     Observer.changingInstr(MI);
5070     moreElementsVectorSrc(MI, MoreTy, 1);
5071     moreElementsVectorSrc(MI, MoreTy, 2);
5072     moreElementsVectorDst(MI, MoreTy, 0);
5073     Observer.changedInstr(MI);
5074     return Legalized;
5075   }
5076   case TargetOpcode::G_EXTRACT:
5077     if (TypeIdx != 1)
5078       return UnableToLegalize;
5079     Observer.changingInstr(MI);
5080     moreElementsVectorSrc(MI, MoreTy, 1);
5081     Observer.changedInstr(MI);
5082     return Legalized;
5083   case TargetOpcode::G_INSERT:
5084   case TargetOpcode::G_FREEZE:
5085     if (TypeIdx != 0)
5086       return UnableToLegalize;
5087     Observer.changingInstr(MI);
5088     moreElementsVectorSrc(MI, MoreTy, 1);
5089     moreElementsVectorDst(MI, MoreTy, 0);
5090     Observer.changedInstr(MI);
5091     return Legalized;
5092   case TargetOpcode::G_SELECT:
5093     if (TypeIdx != 0)
5094       return UnableToLegalize;
5095     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
5096       return UnableToLegalize;
5097 
5098     Observer.changingInstr(MI);
5099     moreElementsVectorSrc(MI, MoreTy, 2);
5100     moreElementsVectorSrc(MI, MoreTy, 3);
5101     moreElementsVectorDst(MI, MoreTy, 0);
5102     Observer.changedInstr(MI);
5103     return Legalized;
5104   case TargetOpcode::G_UNMERGE_VALUES: {
5105     if (TypeIdx != 1)
5106       return UnableToLegalize;
5107 
5108     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
5109     int NumDst = MI.getNumOperands() - 1;
5110     moreElementsVectorSrc(MI, MoreTy, NumDst);
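    // For example, unmerging four s32 pieces from a v4s32 source widened to
    // v8s32 requires eight s32 defs; the extra four added below are dead.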
5111 
5112     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5113     for (int I = 0; I != NumDst; ++I)
5114       MIB.addDef(MI.getOperand(I).getReg());
5115 
5116     int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
5117     for (int I = NumDst; I != NewNumDst; ++I)
5118       MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
5119 
5120     MIB.addUse(MI.getOperand(NumDst).getReg());
5121     MI.eraseFromParent();
5122     return Legalized;
5123   }
5124   case TargetOpcode::G_PHI:
5125     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5126   case TargetOpcode::G_SHUFFLE_VECTOR:
5127     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5128   default:
5129     return UnableToLegalize;
5130   }
5131 }
5132 
5133 LegalizerHelper::LegalizeResult
5134 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5135                                            unsigned int TypeIdx, LLT MoreTy) {
5136   if (TypeIdx != 0)
5137     return UnableToLegalize;
5138 
5139   Register DstReg = MI.getOperand(0).getReg();
5140   Register Src1Reg = MI.getOperand(1).getReg();
5141   Register Src2Reg = MI.getOperand(2).getReg();
5142   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5143   LLT DstTy = MRI.getType(DstReg);
5144   LLT Src1Ty = MRI.getType(Src1Reg);
5145   LLT Src2Ty = MRI.getType(Src2Reg);
5146   unsigned NumElts = DstTy.getNumElements();
5147   unsigned WidenNumElts = MoreTy.getNumElements();
5148 
5149   // Expect a canonicalized shuffle.
5150   if (DstTy != Src1Ty || DstTy != Src2Ty)
5151     return UnableToLegalize;
5152 
5153   moreElementsVectorSrc(MI, MoreTy, 1);
5154   moreElementsVectorSrc(MI, MoreTy, 2);
5155 
5156   // Adjust mask based on new input vector length.
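  // For example, widening from v4 to v8 inputs, a mask element 5 (element 1
  // of the second source) becomes 5 - 4 + 8 = 9, element 1 of the widened
  // second source.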
5157   SmallVector<int, 16> NewMask;
5158   for (unsigned I = 0; I != NumElts; ++I) {
5159     int Idx = Mask[I];
5160     if (Idx < static_cast<int>(NumElts))
5161       NewMask.push_back(Idx);
5162     else
5163       NewMask.push_back(Idx - NumElts + WidenNumElts);
5164   }
5165   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5166     NewMask.push_back(-1);
5167   moreElementsVectorDst(MI, MoreTy, 0);
5168   MIRBuilder.setInstrAndDebugLoc(MI);
5169   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5170                                 MI.getOperand(1).getReg(),
5171                                 MI.getOperand(2).getReg(), NewMask);
5172   MI.eraseFromParent();
5173   return Legalized;
5174 }
5175 
5176 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5177                                         ArrayRef<Register> Src1Regs,
5178                                         ArrayRef<Register> Src2Regs,
5179                                         LLT NarrowTy) {
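  // Schoolbook multiplication on NarrowTy-sized limbs: the low half of each
  // partial product lands in its destination part, while the high halves
  // (G_UMULH) and accumulated carries feed the next part. For two limbs per
  // operand, for example, Dst[0] = lo(a0*b0) and
  // Dst[1] = lo(a1*b0) + lo(a0*b1) + hi(a0*b0).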
5180   MachineIRBuilder &B = MIRBuilder;
5181   unsigned SrcParts = Src1Regs.size();
5182   unsigned DstParts = DstRegs.size();
5183 
5184   unsigned DstIdx = 0; // Low bits of the result.
5185   Register FactorSum =
5186       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5187   DstRegs[DstIdx] = FactorSum;
5188 
5189   unsigned CarrySumPrevDstIdx;
5190   SmallVector<Register, 4> Factors;
5191 
5192   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5193     // Collect low parts of muls for DstIdx.
5194     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5195          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5196       MachineInstrBuilder Mul =
5197           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5198       Factors.push_back(Mul.getReg(0));
5199     }
5200     // Collect high parts of muls from previous DstIdx.
5201     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5202          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5203       MachineInstrBuilder Umulh =
5204           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5205       Factors.push_back(Umulh.getReg(0));
5206     }
5207     // Add CarrySum from additions calculated for previous DstIdx.
5208     if (DstIdx != 1) {
5209       Factors.push_back(CarrySumPrevDstIdx);
5210     }
5211 
5212     Register CarrySum;
5213     // Add all factors and accumulate all carries into CarrySum.
5214     if (DstIdx != DstParts - 1) {
5215       MachineInstrBuilder Uaddo =
5216           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5217       FactorSum = Uaddo.getReg(0);
5218       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5219       for (unsigned i = 2; i < Factors.size(); ++i) {
5220         MachineInstrBuilder Uaddo =
5221             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5222         FactorSum = Uaddo.getReg(0);
5223         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5224         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5225       }
5226     } else {
      // Since the value for the next index is not calculated, neither is
      // CarrySum.
5228       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5229       for (unsigned i = 2; i < Factors.size(); ++i)
5230         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5231     }
5232 
5233     CarrySumPrevDstIdx = CarrySum;
5234     DstRegs[DstIdx] = FactorSum;
5235     Factors.clear();
5236   }
5237 }
5238 
5239 LegalizerHelper::LegalizeResult
5240 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5241                                     LLT NarrowTy) {
5242   if (TypeIdx != 0)
5243     return UnableToLegalize;
5244 
5245   Register DstReg = MI.getOperand(0).getReg();
5246   LLT DstType = MRI.getType(DstReg);
5247   // FIXME: add support for vector types
5248   if (DstType.isVector())
5249     return UnableToLegalize;
5250 
5251   unsigned Opcode = MI.getOpcode();
5252   unsigned OpO, OpE, OpF;
5253   switch (Opcode) {
5254   case TargetOpcode::G_SADDO:
5255   case TargetOpcode::G_SADDE:
5256   case TargetOpcode::G_UADDO:
5257   case TargetOpcode::G_UADDE:
5258   case TargetOpcode::G_ADD:
5259     OpO = TargetOpcode::G_UADDO;
5260     OpE = TargetOpcode::G_UADDE;
5261     OpF = TargetOpcode::G_UADDE;
5262     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5263       OpF = TargetOpcode::G_SADDE;
5264     break;
5265   case TargetOpcode::G_SSUBO:
5266   case TargetOpcode::G_SSUBE:
5267   case TargetOpcode::G_USUBO:
5268   case TargetOpcode::G_USUBE:
5269   case TargetOpcode::G_SUB:
5270     OpO = TargetOpcode::G_USUBO;
5271     OpE = TargetOpcode::G_USUBE;
5272     OpF = TargetOpcode::G_USUBE;
5273     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5274       OpF = TargetOpcode::G_SSUBE;
5275     break;
5276   default:
5277     llvm_unreachable("Unexpected add/sub opcode!");
5278   }
5279 
5280   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5281   unsigned NumDefs = MI.getNumExplicitDefs();
5282   Register Src1 = MI.getOperand(NumDefs).getReg();
5283   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5284   Register CarryDst, CarryIn;
5285   if (NumDefs == 2)
5286     CarryDst = MI.getOperand(1).getReg();
5287   if (MI.getNumOperands() == NumDefs + 3)
5288     CarryIn = MI.getOperand(NumDefs + 2).getReg();
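  // For example, narrowing an s64 G_ADD with NarrowTy = s32 emits a G_UADDO
  // on the low halves followed by a G_UADDE on the high halves, threading
  // the carry-out of the first into the carry-in of the second.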
5289 
5290   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5291   LLT LeftoverTy, DummyTy;
5292   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5293   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5294   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5295 
5296   int NarrowParts = Src1Regs.size();
5297   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5298     Src1Regs.push_back(Src1Left[I]);
5299     Src2Regs.push_back(Src2Left[I]);
5300   }
5301   DstRegs.reserve(Src1Regs.size());
5302 
5303   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5304     Register DstReg =
5305         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5306     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register.
5308     if (i == e - 1 && CarryDst)
5309       CarryOut = CarryDst;
5310 
5311     if (!CarryIn) {
5312       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5313                             {Src1Regs[i], Src2Regs[i]});
5314     } else if (i == e - 1) {
5315       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5316                             {Src1Regs[i], Src2Regs[i], CarryIn});
5317     } else {
5318       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5319                             {Src1Regs[i], Src2Regs[i], CarryIn});
5320     }
5321 
5322     DstRegs.push_back(DstReg);
5323     CarryIn = CarryOut;
5324   }
5325   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5326               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5327               makeArrayRef(DstRegs).drop_front(NarrowParts));
5328 
5329   MI.eraseFromParent();
5330   return Legalized;
5331 }
5332 
5333 LegalizerHelper::LegalizeResult
5334 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5335   Register DstReg = MI.getOperand(0).getReg();
5336   Register Src1 = MI.getOperand(1).getReg();
5337   Register Src2 = MI.getOperand(2).getReg();
5338 
5339   LLT Ty = MRI.getType(DstReg);
5340   if (Ty.isVector())
5341     return UnableToLegalize;
5342 
5343   unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
5344   unsigned DstSize = Ty.getSizeInBits();
5345   unsigned NarrowSize = NarrowTy.getSizeInBits();
5346   if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
5347     return UnableToLegalize;
5348 
5349   unsigned NumDstParts = DstSize / NarrowSize;
5350   unsigned NumSrcParts = SrcSize / NarrowSize;
5351   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5352   unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
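  // For example, an s64 G_MUL with NarrowTy = s32 multiplies two parts per
  // source and keeps both result parts, while an s64 G_UMULH computes all
  // four parts of the full 128-bit product and keeps only the high two.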
5353 
5354   SmallVector<Register, 2> Src1Parts, Src2Parts;
5355   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5356   extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
5357   extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
5358   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5359 
  // Take only the high half of the registers if this is a high multiply.
5361   ArrayRef<Register> DstRegs(
5362       IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
5363   MIRBuilder.buildMerge(DstReg, DstRegs);
5364   MI.eraseFromParent();
5365   return Legalized;
5366 }
5367 
5368 LegalizerHelper::LegalizeResult
5369 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5370                                    LLT NarrowTy) {
5371   if (TypeIdx != 0)
5372     return UnableToLegalize;
5373 
5374   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5375 
5376   Register Src = MI.getOperand(1).getReg();
5377   LLT SrcTy = MRI.getType(Src);
5378 
5379   // If all finite floats fit into the narrowed integer type, we can just swap
5380   // out the result type. This is practically only useful for conversions from
5381   // half to at least 16-bits, so just handle the one case.
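  // (The largest finite half is 65504, which needs 17 bits as a signed
  // integer but only 16 as unsigned, hence the bound below.)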
5382   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5383       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5384     return UnableToLegalize;
5385 
5386   Observer.changingInstr(MI);
5387   narrowScalarDst(MI, NarrowTy, 0,
5388                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5389   Observer.changedInstr(MI);
5390   return Legalized;
5391 }
5392 
5393 LegalizerHelper::LegalizeResult
5394 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5395                                      LLT NarrowTy) {
5396   if (TypeIdx != 1)
5397     return UnableToLegalize;
5398 
5399   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5400 
5401   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5402   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5403   // NarrowSize.
5404   if (SizeOp1 % NarrowSize != 0)
5405     return UnableToLegalize;
5406   int NumParts = SizeOp1 / NarrowSize;
5407 
5408   SmallVector<Register, 2> SrcRegs, DstRegs;
5409   SmallVector<uint64_t, 2> Indexes;
5410   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5411 
5412   Register OpReg = MI.getOperand(0).getReg();
5413   uint64_t OpStart = MI.getOperand(2).getImm();
5414   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
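  // For example, extracting an s32 at bit offset 16 from an s64 split into
  // two s32 parts takes the high 16 bits of part 0 and the low 16 bits of
  // part 1, then merges the two s16 segments into the s32 result.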
5415   for (int i = 0; i < NumParts; ++i) {
5416     unsigned SrcStart = i * NarrowSize;
5417 
5418     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5419       // No part of the extract uses this subregister, ignore it.
5420       continue;
5421     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5422       // The entire subregister is extracted, forward the value.
5423       DstRegs.push_back(SrcRegs[i]);
5424       continue;
5425     }
5426 
    // Compute where the covered segment starts within this source part
    // (ExtractOffset) and how many bits it spans (SegSize).
5429     int64_t ExtractOffset;
5430     uint64_t SegSize;
5431     if (OpStart < SrcStart) {
5432       ExtractOffset = 0;
5433       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5434     } else {
5435       ExtractOffset = OpStart - SrcStart;
5436       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5437     }
5438 
5439     Register SegReg = SrcRegs[i];
5440     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5441       // A genuine extract is needed.
5442       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5443       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5444     }
5445 
5446     DstRegs.push_back(SegReg);
5447   }
5448 
5449   Register DstReg = MI.getOperand(0).getReg();
5450   if (MRI.getType(DstReg).isVector())
5451     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5452   else if (DstRegs.size() > 1)
5453     MIRBuilder.buildMerge(DstReg, DstRegs);
5454   else
5455     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5456   MI.eraseFromParent();
5457   return Legalized;
5458 }
5459 
5460 LegalizerHelper::LegalizeResult
5461 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5462                                     LLT NarrowTy) {
5463   // FIXME: Don't know how to handle secondary types yet.
5464   if (TypeIdx != 0)
5465     return UnableToLegalize;
5466 
5467   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5468   SmallVector<uint64_t, 2> Indexes;
5469   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5470   LLT LeftoverTy;
5471   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5472                LeftoverRegs);
5473 
5474   for (Register Reg : LeftoverRegs)
5475     SrcRegs.push_back(Reg);
5476 
5477   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5478   Register OpReg = MI.getOperand(2).getReg();
5479   uint64_t OpStart = MI.getOperand(3).getImm();
5480   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5481   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5482     unsigned DstStart = I * NarrowSize;
5483 
5484     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5485       // The entire subregister is defined by this insert, forward the new
5486       // value.
5487       DstRegs.push_back(OpReg);
5488       continue;
5489     }
5490 
5491     Register SrcReg = SrcRegs[I];
5492     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5493       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5494       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5495       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5496     }
5497 
5498     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5499       // No part of the insert affects this subregister, forward the original.
5500       DstRegs.push_back(SrcReg);
5501       continue;
5502     }
5503 
    // Compute where the inserted segment starts within this destination part
    // (InsertOffset), where it starts within OpReg (ExtractOffset), and how
    // many bits it spans (SegSize).
5506     int64_t ExtractOffset, InsertOffset;
5507     uint64_t SegSize;
5508     if (OpStart < DstStart) {
5509       InsertOffset = 0;
5510       ExtractOffset = DstStart - OpStart;
5511       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5512     } else {
5513       InsertOffset = OpStart - DstStart;
5514       ExtractOffset = 0;
5515       SegSize =
5516         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5517     }
5518 
5519     Register SegReg = OpReg;
5520     if (ExtractOffset != 0 || SegSize != OpSize) {
5521       // A genuine extract is needed.
5522       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5523       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5524     }
5525 
5526     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5527     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5528     DstRegs.push_back(DstReg);
5529   }
5530 
5531   uint64_t WideSize = DstRegs.size() * NarrowSize;
5532   Register DstReg = MI.getOperand(0).getReg();
5533   if (WideSize > RegTy.getSizeInBits()) {
5534     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5535     MIRBuilder.buildMerge(MergeReg, DstRegs);
5536     MIRBuilder.buildTrunc(DstReg, MergeReg);
5537   } else
5538     MIRBuilder.buildMerge(DstReg, DstRegs);
5539 
5540   MI.eraseFromParent();
5541   return Legalized;
5542 }
5543 
5544 LegalizerHelper::LegalizeResult
5545 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5546                                    LLT NarrowTy) {
5547   Register DstReg = MI.getOperand(0).getReg();
5548   LLT DstTy = MRI.getType(DstReg);
5549 
5550   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
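  // For example, narrowing an s96 bitwise op with NarrowTy = s64 emits one
  // s64 instance of the op plus an s32 instance on the leftover parts, then
  // reassembles the s96 result with insertParts.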
5551 
5552   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5553   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5554   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5555   LLT LeftoverTy;
5556   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5557                     Src0Regs, Src0LeftoverRegs))
5558     return UnableToLegalize;
5559 
5560   LLT Unused;
5561   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5562                     Src1Regs, Src1LeftoverRegs))
5563     llvm_unreachable("inconsistent extractParts result");
5564 
5565   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5566     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5567                                         {Src0Regs[I], Src1Regs[I]});
5568     DstRegs.push_back(Inst.getReg(0));
5569   }
5570 
5571   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5572     auto Inst = MIRBuilder.buildInstr(
5573       MI.getOpcode(),
5574       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5575     DstLeftoverRegs.push_back(Inst.getReg(0));
5576   }
5577 
5578   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5579               LeftoverTy, DstLeftoverRegs);
5580 
5581   MI.eraseFromParent();
5582   return Legalized;
5583 }
5584 
5585 LegalizerHelper::LegalizeResult
5586 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5587                                  LLT NarrowTy) {
5588   if (TypeIdx != 0)
5589     return UnableToLegalize;
5590 
5591   Register DstReg = MI.getOperand(0).getReg();
5592   Register SrcReg = MI.getOperand(1).getReg();
5593 
5594   LLT DstTy = MRI.getType(DstReg);
5595   if (DstTy.isVector())
5596     return UnableToLegalize;
5597 
5598   SmallVector<Register, 8> Parts;
5599   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy =
      buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5601   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5602 
5603   MI.eraseFromParent();
5604   return Legalized;
5605 }
5606 
5607 LegalizerHelper::LegalizeResult
5608 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5609                                     LLT NarrowTy) {
5610   if (TypeIdx != 0)
5611     return UnableToLegalize;
5612 
5613   Register CondReg = MI.getOperand(1).getReg();
5614   LLT CondTy = MRI.getType(CondReg);
5615   if (CondTy.isVector()) // TODO: Handle vselect
5616     return UnableToLegalize;
5617 
5618   Register DstReg = MI.getOperand(0).getReg();
5619   LLT DstTy = MRI.getType(DstReg);
5620 
5621   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5622   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5623   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5624   LLT LeftoverTy;
5625   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5626                     Src1Regs, Src1LeftoverRegs))
5627     return UnableToLegalize;
5628 
5629   LLT Unused;
5630   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5631                     Src2Regs, Src2LeftoverRegs))
5632     llvm_unreachable("inconsistent extractParts result");
5633 
5634   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5635     auto Select = MIRBuilder.buildSelect(NarrowTy,
5636                                          CondReg, Src1Regs[I], Src2Regs[I]);
5637     DstRegs.push_back(Select.getReg(0));
5638   }
5639 
5640   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5641     auto Select = MIRBuilder.buildSelect(
5642       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5643     DstLeftoverRegs.push_back(Select.getReg(0));
5644   }
5645 
5646   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5647               LeftoverTy, DstLeftoverRegs);
5648 
5649   MI.eraseFromParent();
5650   return Legalized;
5651 }
5652 
5653 LegalizerHelper::LegalizeResult
5654 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5655                                   LLT NarrowTy) {
5656   if (TypeIdx != 1)
5657     return UnableToLegalize;
5658 
5659   Register DstReg = MI.getOperand(0).getReg();
5660   Register SrcReg = MI.getOperand(1).getReg();
5661   LLT DstTy = MRI.getType(DstReg);
5662   LLT SrcTy = MRI.getType(SrcReg);
5663   unsigned NarrowSize = NarrowTy.getSizeInBits();
5664 
5665   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5666     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5667 
5668     MachineIRBuilder &B = MIRBuilder;
5669     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5670     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
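    // The high half can use the zero-undef variant unconditionally: its
    // result is only selected when Hi is known non-zero.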
5671     auto C_0 = B.buildConstant(NarrowTy, 0);
5672     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5673                                 UnmergeSrc.getReg(1), C_0);
5674     auto LoCTLZ = IsUndef ?
5675       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5676       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5677     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5678     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5679     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5680     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5681 
5682     MI.eraseFromParent();
5683     return Legalized;
5684   }
5685 
5686   return UnableToLegalize;
5687 }
5688 
5689 LegalizerHelper::LegalizeResult
5690 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5691                                   LLT NarrowTy) {
5692   if (TypeIdx != 1)
5693     return UnableToLegalize;
5694 
5695   Register DstReg = MI.getOperand(0).getReg();
5696   Register SrcReg = MI.getOperand(1).getReg();
5697   LLT DstTy = MRI.getType(DstReg);
5698   LLT SrcTy = MRI.getType(SrcReg);
5699   unsigned NarrowSize = NarrowTy.getSizeInBits();
5700 
5701   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5702     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5703 
5704     MachineIRBuilder &B = MIRBuilder;
5705     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5706     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
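    // The low half can use the zero-undef variant unconditionally: its
    // result is only selected when Lo is known non-zero.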
5707     auto C_0 = B.buildConstant(NarrowTy, 0);
5708     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5709                                 UnmergeSrc.getReg(0), C_0);
5710     auto HiCTTZ = IsUndef ?
5711       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5712       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5713     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5714     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5715     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5716     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5717 
5718     MI.eraseFromParent();
5719     return Legalized;
5720   }
5721 
5722   return UnableToLegalize;
5723 }
5724 
5725 LegalizerHelper::LegalizeResult
5726 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5727                                    LLT NarrowTy) {
5728   if (TypeIdx != 1)
5729     return UnableToLegalize;
5730 
5731   Register DstReg = MI.getOperand(0).getReg();
5732   LLT DstTy = MRI.getType(DstReg);
5733   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5734   unsigned NarrowSize = NarrowTy.getSizeInBits();
5735 
5736   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5737     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
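
    // ctpop(Hi:Lo) = ctpop(Hi) + ctpop(Lo); the two halves are independent.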
5738 
5739     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5740     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5741     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5742 
5743     MI.eraseFromParent();
5744     return Legalized;
5745   }
5746 
5747   return UnableToLegalize;
5748 }
5749 
5750 LegalizerHelper::LegalizeResult
5751 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5752   unsigned Opc = MI.getOpcode();
5753   const auto &TII = MIRBuilder.getTII();
5754   auto isSupported = [this](const LegalityQuery &Q) {
5755     auto QAction = LI.getAction(Q).Action;
5756     return QAction == Legal || QAction == Libcall || QAction == Custom;
5757   };
5758   switch (Opc) {
5759   default:
5760     return UnableToLegalize;
5761   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5762     // This trivially expands to CTLZ.
5763     Observer.changingInstr(MI);
5764     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5765     Observer.changedInstr(MI);
5766     return Legalized;
5767   }
5768   case TargetOpcode::G_CTLZ: {
5769     Register DstReg = MI.getOperand(0).getReg();
5770     Register SrcReg = MI.getOperand(1).getReg();
5771     LLT DstTy = MRI.getType(DstReg);
5772     LLT SrcTy = MRI.getType(SrcReg);
5773     unsigned Len = SrcTy.getSizeInBits();
5774 
5775     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5776       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5777       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5778       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5779       auto ICmp = MIRBuilder.buildICmp(
5780           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5781       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5782       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5783       MI.eraseFromParent();
5784       return Legalized;
5785     }
    // For now, we do this, with shift amounts doubling up to NewLen/2:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >> 16);
    // x = x | (x >> 32); // for 64-bit input
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
5797     Register Op = SrcReg;
5798     unsigned NewLen = PowerOf2Ceil(Len);
5799     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5800       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5801       auto MIBOp = MIRBuilder.buildOr(
5802           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5803       Op = MIBOp.getReg(0);
5804     }
5805     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5806     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5807                         MIBPop);
5808     MI.eraseFromParent();
5809     return Legalized;
5810   }
5811   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5812     // This trivially expands to CTTZ.
5813     Observer.changingInstr(MI);
5814     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5815     Observer.changedInstr(MI);
5816     return Legalized;
5817   }
5818   case TargetOpcode::G_CTTZ: {
5819     Register DstReg = MI.getOperand(0).getReg();
5820     Register SrcReg = MI.getOperand(1).getReg();
5821     LLT DstTy = MRI.getType(DstReg);
5822     LLT SrcTy = MRI.getType(SrcReg);
5823 
5824     unsigned Len = SrcTy.getSizeInBits();
5825     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5826       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5827       // zero.
5828       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5829       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5830       auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, Zero);
5832       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5833       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5834       MI.eraseFromParent();
5835       return Legalized;
5836     }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return Len - ctlz(~x & (x - 1)); }
5840     // Ref: "Hacker's Delight" by Henry Warren
5841     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5842     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5843     auto MIBTmp = MIRBuilder.buildAnd(
5844         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5845     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5846         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5847       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5848       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5849                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5850       MI.eraseFromParent();
5851       return Legalized;
5852     }
5853     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5854     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5855     return Legalized;
5856   }
5857   case TargetOpcode::G_CTPOP: {
5858     Register SrcReg = MI.getOperand(1).getReg();
5859     LLT Ty = MRI.getType(SrcReg);
5860     unsigned Size = Ty.getSizeInBits();
5861     MachineIRBuilder &B = MIRBuilder;
5862 
    // Count set bits in blocks of 2 bits. The default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use the following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives the same result per 2-bit block with one instruction
    // fewer.
5868     auto C_1 = B.buildConstant(Ty, 1);
5869     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5870     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5871     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5872     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5873     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5874 
    // To get the count in blocks of 4, add the values from adjacent 2-bit
    // blocks.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5877     auto C_2 = B.buildConstant(Ty, 2);
5878     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5879     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5880     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5881     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5882     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5883     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5884 
    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
    // before the addition, since each count is at most 8 and fits in 4 bits.
    // After the addition the high 4 bits still hold the count of set bits in
    // the high 4-bit block; clear them to get the 8-bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5890     auto C_4 = B.buildConstant(Ty, 4);
5891     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5892     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5893     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5894     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5895     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5896 
    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold the CTPOP result of a 128-bit integer or smaller. The
    // multiply by this bitmask sets the 8 most significant bits of ResTmp to
    // the sum of all the B8Count values in the 8-bit blocks.
5900     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5901     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5902 
5903     // Shift count result from 8 high bits to low bits.
5904     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5905     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5906 
5907     MI.eraseFromParent();
5908     return Legalized;
5909   }
5910   }
5911 }
5912 
5913 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5914 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5915                                         Register Reg, unsigned BW) {
5916   return matchUnaryPredicate(
5917       MRI, Reg,
5918       [=](const Constant *C) {
5919         // Null constant here means an undef.
5920         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5921         return !CI || CI->getValue().urem(BW) != 0;
5922       },
5923       /*AllowUndefs*/ true);
5924 }
5925 
5926 LegalizerHelper::LegalizeResult
5927 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5928   Register Dst = MI.getOperand(0).getReg();
5929   Register X = MI.getOperand(1).getReg();
5930   Register Y = MI.getOperand(2).getReg();
5931   Register Z = MI.getOperand(3).getReg();
5932   LLT Ty = MRI.getType(Dst);
5933   LLT ShTy = MRI.getType(Z);
5934 
5935   unsigned BW = Ty.getScalarSizeInBits();
5936 
5937   if (!isPowerOf2_32(BW))
5938     return UnableToLegalize;
5939 
5940   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5941   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5942 
5943   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5944     // fshl X, Y, Z -> fshr X, Y, -Z
5945     // fshr X, Y, Z -> fshl X, Y, -Z
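    // The funnel shift reads its amount modulo BW, and register negation is
    // performed modulo a power of 2, so -Z is congruent to BW - Z (mod BW)
    // only because BW is a power of 2 (checked above).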
5946     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
5948   } else {
5949     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5950     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5951     auto One = MIRBuilder.buildConstant(ShTy, 1);
5952     if (IsFSHL) {
5953       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5954       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5955     } else {
5956       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5957       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5958     }
5959 
5960     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5961   }
5962 
5963   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5964   MI.eraseFromParent();
5965   return Legalized;
5966 }
5967 
5968 LegalizerHelper::LegalizeResult
5969 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5970   Register Dst = MI.getOperand(0).getReg();
5971   Register X = MI.getOperand(1).getReg();
5972   Register Y = MI.getOperand(2).getReg();
5973   Register Z = MI.getOperand(3).getReg();
5974   LLT Ty = MRI.getType(Dst);
5975   LLT ShTy = MRI.getType(Z);
5976 
5977   const unsigned BW = Ty.getScalarSizeInBits();
5978   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5979 
5980   Register ShX, ShY;
5981   Register ShAmt, InvShAmt;
5982 
5983   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5984   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5985     // fshl: X << C | Y >> (BW - C)
5986     // fshr: X << (BW - C) | Y >> C
5987     // where C = Z % BW is not zero
5988     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5989     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5990     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5991     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5992     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5993   } else {
5994     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5995     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5996     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5997     if (isPowerOf2_32(BW)) {
5998       // Z % BW -> Z & (BW - 1)
5999       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
6000       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
6001       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
6002       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
6003     } else {
6004       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6005       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6006       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6007     }
6008 
6009     auto One = MIRBuilder.buildConstant(ShTy, 1);
6010     if (IsFSHL) {
6011       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6012       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6013       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6014     } else {
6015       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6016       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6017       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6018     }
6019   }
6020 
6021   MIRBuilder.buildOr(Dst, ShX, ShY);
6022   MI.eraseFromParent();
6023   return Legalized;
6024 }
6025 
6026 LegalizerHelper::LegalizeResult
6027 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6028   // These operations approximately do the following (while avoiding undefined
6029   // shifts by BW):
6030   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6031   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6032   Register Dst = MI.getOperand(0).getReg();
6033   LLT Ty = MRI.getType(Dst);
6034   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6035 
6036   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6037   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6038 
6039   // TODO: Use smarter heuristic that accounts for vector legalization.
6040   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6041     return lowerFunnelShiftAsShifts(MI);
6042 
  // This only works for powers of 2; fall back to shifts if it fails.
6044   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6045   if (Result == UnableToLegalize)
6046     return lowerFunnelShiftAsShifts(MI);
6047   return Result;
6048 }
6049 
6050 LegalizerHelper::LegalizeResult
6051 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6052   Register Dst = MI.getOperand(0).getReg();
6053   Register Src = MI.getOperand(1).getReg();
6054   Register Amt = MI.getOperand(2).getReg();
6055   LLT AmtTy = MRI.getType(Amt);
6056   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6057   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6058   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6059   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6060   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6061   MI.eraseFromParent();
6062   return Legalized;
6063 }
6064 
6065 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6066   Register Dst = MI.getOperand(0).getReg();
6067   Register Src = MI.getOperand(1).getReg();
6068   Register Amt = MI.getOperand(2).getReg();
6069   LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
6071   LLT AmtTy = MRI.getType(Amt);
6072 
6073   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6074   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6075 
6076   MIRBuilder.setInstrAndDebugLoc(MI);
6077 
6078   // If a rotate in the other direction is supported, use it.
6079   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6080   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6081       isPowerOf2_32(EltSizeInBits))
6082     return lowerRotateWithReverseRotate(MI);
6083 
6084   // If a funnel shift is supported, use it.
6085   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6086   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6087   bool IsFShLegal = false;
6088   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
6089       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
6090     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6091                                 Register R3) {
6092       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
6093       MI.eraseFromParent();
6094       return Legalized;
6095     };
    // Use the matching funnel shift directly if it is supported; otherwise
    // negate the amount and use the one in the other direction.
6097     if (IsFShLegal) {
6098       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6099     } else if (isPowerOf2_32(EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(AmtTy, Amt).getReg(0);
6101       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6102     }
6103   }
6104 
6105   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6106   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6107   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6108   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6109   Register ShVal;
6110   Register RevShiftVal;
6111   if (isPowerOf2_32(EltSizeInBits)) {
6112     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6113     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6114     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6115     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6116     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6117     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6118     RevShiftVal =
6119         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6120   } else {
6121     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6122     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6123     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6124     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6125     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6126     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6127     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6128     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6129     RevShiftVal =
6130         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6131   }
6132   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6133   MI.eraseFromParent();
6134   return Legalized;
6135 }
6136 
6137 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6138 // representation.
6139 LegalizerHelper::LegalizeResult
6140 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6141   Register Dst = MI.getOperand(0).getReg();
6142   Register Src = MI.getOperand(1).getReg();
6143   const LLT S64 = LLT::scalar(64);
6144   const LLT S32 = LLT::scalar(32);
6145   const LLT S1 = LLT::scalar(1);
6146 
6147   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6148 
6149   // unsigned cul2f(ulong u) {
6150   //   uint lz = clz(u);
6151   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6152   //   u = (u << lz) & 0x7fffffffffffffffUL;
6153   //   ulong t = u & 0xffffffffffUL;
6154   //   uint v = (e << 23) | (uint)(u >> 40);
6155   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6156   //   return as_float(v + r);
6157   // }
6158 
6159   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6160   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6161 
6162   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6163 
6164   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6165   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6166 
6167   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6168   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6169 
6170   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6171   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6172 
6173   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6174 
6175   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6176   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6177 
  auto UShr = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShr));
6181 
6182   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6183   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6184   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6185   auto One = MIRBuilder.buildConstant(S32, 1);
6186 
6187   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6188   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6189   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6190   MIRBuilder.buildAdd(Dst, V, R);
6191 
6192   MI.eraseFromParent();
6193   return Legalized;
6194 }
6195 
6196 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6197   Register Dst = MI.getOperand(0).getReg();
6198   Register Src = MI.getOperand(1).getReg();
6199   LLT DstTy = MRI.getType(Dst);
6200   LLT SrcTy = MRI.getType(Src);
6201 
6202   if (SrcTy == LLT::scalar(1)) {
6203     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6204     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6205     MIRBuilder.buildSelect(Dst, Src, True, False);
6206     MI.eraseFromParent();
6207     return Legalized;
6208   }
6209 
6210   if (SrcTy != LLT::scalar(64))
6211     return UnableToLegalize;
6212 
6213   if (DstTy == LLT::scalar(32)) {
    // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonable depending on the available instructions. If a target
    // has sitofp, does not have CTLZ, or can efficiently use f64 as an
    // intermediate type, this is probably worse.
6218     return lowerU64ToF32BitOps(MI);
6219   }
6220 
6221   return UnableToLegalize;
6222 }
6223 
6224 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6225   Register Dst = MI.getOperand(0).getReg();
6226   Register Src = MI.getOperand(1).getReg();
6227   LLT DstTy = MRI.getType(Dst);
6228   LLT SrcTy = MRI.getType(Src);
6229 
6230   const LLT S64 = LLT::scalar(64);
6231   const LLT S32 = LLT::scalar(32);
6232   const LLT S1 = LLT::scalar(1);
6233 
6234   if (SrcTy == S1) {
6235     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6236     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6237     MIRBuilder.buildSelect(Dst, Src, True, False);
6238     MI.eraseFromParent();
6239     return Legalized;
6240   }
6241 
6242   if (SrcTy != S64)
6243     return UnableToLegalize;
6244 
6245   if (DstTy == S32) {
6246     // signed cl2f(long l) {
6247     //   long s = l >> 63;
6248     //   float r = cul2f((l + s) ^ s);
6249     //   return s ? -r : r;
6250     // }
6251     Register L = Src;
6252     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6253     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6254 
6255     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6256     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6257     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6258 
6259     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6260     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6261                                             MIRBuilder.buildConstant(S64, 0));
6262     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6263     MI.eraseFromParent();
6264     return Legalized;
6265   }
6266 
6267   return UnableToLegalize;
6268 }
6269 
6270 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6271   Register Dst = MI.getOperand(0).getReg();
6272   Register Src = MI.getOperand(1).getReg();
6273   LLT DstTy = MRI.getType(Dst);
6274   LLT SrcTy = MRI.getType(Src);
6275   const LLT S64 = LLT::scalar(64);
6276   const LLT S32 = LLT::scalar(32);
6277 
6278   if (SrcTy != S64 && SrcTy != S32)
6279     return UnableToLegalize;
6280   if (DstTy != S32 && DstTy != S64)
6281     return UnableToLegalize;
6282 
  // FPTOSI gives the same result as FPTOUI for positive signed integers.
  // FPTOUI additionally needs to handle fp values that convert to unsigned
  // integers greater than or equal to 2^(DstBits - 1): 2^31 for an i32 result
  // or 2^63 for i64. Call this threshold 2^Exp for brevity.
6286 
6287   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6288   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6289                                                 : APFloat::IEEEdouble(),
6290                     APInt::getNullValue(SrcTy.getSizeInBits()));
6291   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6292 
6293   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6294 
6295   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp values greater than or equal to Threshold (2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp back by setting the highest bit in the
  // result to 1.
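  // The XOR acts as that add of 2^Exp: FPTOSI of (Value - 2^Exp) produces a
  // result with the highest bit clear for in-range values.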
6298   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6299   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6300   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6301   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6302 
6303   const LLT S1 = LLT::scalar(1);
6304 
6305   MachineInstrBuilder FCMP =
6306       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6307   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6308 
6309   MI.eraseFromParent();
6310   return Legalized;
6311 }
6312 
6313 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6314   Register Dst = MI.getOperand(0).getReg();
6315   Register Src = MI.getOperand(1).getReg();
6316   LLT DstTy = MRI.getType(Dst);
6317   LLT SrcTy = MRI.getType(Src);
6318   const LLT S64 = LLT::scalar(64);
6319   const LLT S32 = LLT::scalar(32);
6320 
6321   // FIXME: Only f32 to i64 conversions are supported.
6322   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6323     return UnableToLegalize;
6324 
6325   // Expand f32 -> i64 conversion
6326   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6327   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6328 
6329   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6330 
6331   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6332   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6333 
6334   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6335   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6336 
6337   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6338                                            APInt::getSignMask(SrcEltBits));
6339   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6340   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6341   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6342   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6343 
6344   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6345   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6346   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6347 
6348   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6349   R = MIRBuilder.buildZExt(DstTy, R);
6350 
6351   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6352   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6353   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6354   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6355 
6356   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6357   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6358 
6359   const LLT S1 = LLT::scalar(1);
6360   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6361                                     S1, Exponent, ExponentLoBit);
6362 
6363   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6364 
  // (R ^ Sign) - Sign negates R exactly when the sign mask is all ones,
  // applying the sign extracted above.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6367 
6368   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6369 
6370   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6371                                           S1, Exponent, ZeroSrcTy);
6372 
6373   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6374   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6375 
6376   MI.eraseFromParent();
6377   return Legalized;
6378 }
6379 
6380 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6381 LegalizerHelper::LegalizeResult
6382 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6383   Register Dst = MI.getOperand(0).getReg();
6384   Register Src = MI.getOperand(1).getReg();
6385 
6386   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6387     return UnableToLegalize;
6388 
6389   const unsigned ExpMask = 0x7ff;
6390   const unsigned ExpBiasf64 = 1023;
6391   const unsigned ExpBiasf16 = 15;
6392   const LLT S32 = LLT::scalar(32);
6393   const LLT S1 = LLT::scalar(1);
6394 
6395   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6396   Register U = Unmerge.getReg(0);
6397   Register UH = Unmerge.getReg(1);
6398 
6399   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6400   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6401 
6402   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6403   // add the f16 bias (15) to get the biased exponent for the f16 format.
6404   E = MIRBuilder.buildAdd(
6405     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6406 
6407   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6408   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6409 
6410   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6411                                        MIRBuilder.buildConstant(S32, 0x1ff));
6412   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6413 
6414   auto Zero = MIRBuilder.buildConstant(S32, 0);
6415   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6416   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6417   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6418 
6419   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6420   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6421   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6422   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6423 
6424   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6425   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6426 
6427   // N = M | (E << 12);
6428   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6429   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6430 
6431   // B = clamp(1-E, 0, 13);
6432   auto One = MIRBuilder.buildConstant(S32, 1);
6433   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6434   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6435   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6436 
6437   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6438                                        MIRBuilder.buildConstant(S32, 0x1000));
6439 
6440   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6441   auto D0 = MIRBuilder.buildShl(S32, D, B);
6442 
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
6445   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6446   D = MIRBuilder.buildOr(S32, D, D1);
6447 
6448   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6449   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6450 
6451   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6452   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6453 
6454   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6455                                        MIRBuilder.buildConstant(S32, 3));
6456   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6457 
6458   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6459                                        MIRBuilder.buildConstant(S32, 5));
6460   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6461 
6462   V1 = MIRBuilder.buildOr(S32, V0, V1);
6463   V = MIRBuilder.buildAdd(S32, V, V1);
6464 
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
6467   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6468                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6469 
  // E == 1039 corresponds to a biased f64 exponent of 2047 (Inf or NaN); in
  // that case the result is I, which encodes Inf or a quieted NaN.
  auto CmpEIs1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEIs1039, I, V);
6473 
6474   // Extract the sign bit.
6475   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6476   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6477 
  // Insert the sign bit.
6479   V = MIRBuilder.buildOr(S32, Sign, V);
6480 
6481   MIRBuilder.buildTrunc(Dst, V);
6482   MI.eraseFromParent();
6483   return Legalized;
6484 }
6485 
6486 LegalizerHelper::LegalizeResult
6487 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6488   Register Dst = MI.getOperand(0).getReg();
6489   Register Src = MI.getOperand(1).getReg();
6490 
6491   LLT DstTy = MRI.getType(Dst);
6492   LLT SrcTy = MRI.getType(Src);
6493   const LLT S64 = LLT::scalar(64);
6494   const LLT S16 = LLT::scalar(16);
6495 
6496   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6497     return lowerFPTRUNC_F64_TO_F16(MI);
6498 
6499   return UnableToLegalize;
6500 }
6501 
// TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
// multiplication tree.
6504 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6505   Register Dst = MI.getOperand(0).getReg();
6506   Register Src0 = MI.getOperand(1).getReg();
6507   Register Src1 = MI.getOperand(2).getReg();
6508   LLT Ty = MRI.getType(Dst);
6509 
6510   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6511   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6512   MI.eraseFromParent();
6513   return Legalized;
6514 }
6515 
6516 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6517   switch (Opc) {
6518   case TargetOpcode::G_SMIN:
6519     return CmpInst::ICMP_SLT;
6520   case TargetOpcode::G_SMAX:
6521     return CmpInst::ICMP_SGT;
6522   case TargetOpcode::G_UMIN:
6523     return CmpInst::ICMP_ULT;
6524   case TargetOpcode::G_UMAX:
6525     return CmpInst::ICMP_UGT;
6526   default:
6527     llvm_unreachable("not in integer min/max");
6528   }
6529 }
6530 
6531 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6532   Register Dst = MI.getOperand(0).getReg();
6533   Register Src0 = MI.getOperand(1).getReg();
6534   Register Src1 = MI.getOperand(2).getReg();
6535 
6536   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6537   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
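
  // min/max x, y lowers to select(icmp(pred, x, y), x, y).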
6538 
6539   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6540   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6541 
6542   MI.eraseFromParent();
6543   return Legalized;
6544 }
6545 
6546 LegalizerHelper::LegalizeResult
6547 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6548   Register Dst = MI.getOperand(0).getReg();
6549   Register Src0 = MI.getOperand(1).getReg();
6550   Register Src1 = MI.getOperand(2).getReg();
6551 
6552   const LLT Src0Ty = MRI.getType(Src0);
6553   const LLT Src1Ty = MRI.getType(Src1);
6554 
6555   const int Src0Size = Src0Ty.getScalarSizeInBits();
6556   const int Src1Size = Src1Ty.getScalarSizeInBits();
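
  // copysign X, Y -> (X & ~SignMask) | (sign bit of Y moved into X's sign
  // position).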
6557 
6558   auto SignBitMask = MIRBuilder.buildConstant(
6559     Src0Ty, APInt::getSignMask(Src0Size));
6560 
6561   auto NotSignBitMask = MIRBuilder.buildConstant(
6562     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6563 
6564   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6565   Register And1;
6566   if (Src0Ty == Src1Ty) {
6567     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6568   } else if (Src0Size > Src1Size) {
6569     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6570     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6571     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6572     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6573   } else {
6574     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6575     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6576     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6577     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6578   }
6579 
6580   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6581   // constants are a nan and -0.0, but the final result should preserve
6582   // everything.
6583   unsigned Flags = MI.getFlags();
6584   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6585 
6586   MI.eraseFromParent();
6587   return Legalized;
6588 }
6589 
6590 LegalizerHelper::LegalizeResult
6591 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6592   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6593     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6594 
6595   Register Dst = MI.getOperand(0).getReg();
6596   Register Src0 = MI.getOperand(1).getReg();
6597   Register Src1 = MI.getOperand(2).getReg();
6598   LLT Ty = MRI.getType(Dst);
6599 
6600   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6601     // Insert canonicalizes if it's possible we need to quiet to get correct
6602     // sNaN behavior.
6603 
    // Note this must be done here, and not as an optimization combine, in the
    // absence of a dedicated quiet-sNaN instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
6607     if (!isKnownNeverSNaN(Src0, MRI))
6608       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6609 
6610     if (!isKnownNeverSNaN(Src1, MRI))
6611       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6612   }
6613 
  // With possible sNaNs now quieted (or NaNs known absent), the IEEE variant
  // implements the same semantics.
6616   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6617   MI.eraseFromParent();
6618   return Legalized;
6619 }
6620 
6621 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6622   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6623   Register DstReg = MI.getOperand(0).getReg();
6624   LLT Ty = MRI.getType(DstReg);
6625   unsigned Flags = MI.getFlags();
6626 
6627   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6628                                   Flags);
6629   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6630   MI.eraseFromParent();
6631   return Legalized;
6632 }
6633 
6634 LegalizerHelper::LegalizeResult
6635 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6636   Register DstReg = MI.getOperand(0).getReg();
6637   Register X = MI.getOperand(1).getReg();
6638   const unsigned Flags = MI.getFlags();
6639   const LLT Ty = MRI.getType(DstReg);
6640   const LLT CondTy = Ty.changeElementSize(1);
6641 
6642   // round(x) =>
6643   //  t = trunc(x);
6644   //  d = fabs(x - t);
6645   //  o = copysign(1.0f, x);
6646   //  return t + (d >= 0.5 ? o : 0.0);
6647 
6648   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6649 
6650   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6651   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6652   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6653   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6654   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6655   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6656 
6657   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6658                                   Flags);
6659   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6660 
6661   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6662 
6663   MI.eraseFromParent();
6664   return Legalized;
6665 }
6666 
6667 LegalizerHelper::LegalizeResult
6668 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6669   Register DstReg = MI.getOperand(0).getReg();
6670   Register SrcReg = MI.getOperand(1).getReg();
6671   unsigned Flags = MI.getFlags();
6672   LLT Ty = MRI.getType(DstReg);
6673   const LLT CondTy = Ty.changeElementSize(1);
6674 
6675   // result = trunc(src);
6676   // if (src < 0.0 && src != result)
6677   //   result += -1.0.
6678 
6679   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6680   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6681 
6682   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6683                                   SrcReg, Zero, Flags);
6684   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6685                                       SrcReg, Trunc, Flags);
6686   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6687   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6688 
6689   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6690   MI.eraseFromParent();
6691   return Legalized;
6692 }
6693 
6694 LegalizerHelper::LegalizeResult
6695 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6696   const unsigned NumOps = MI.getNumOperands();
6697   Register DstReg = MI.getOperand(0).getReg();
6698   Register Src0Reg = MI.getOperand(1).getReg();
6699   LLT DstTy = MRI.getType(DstReg);
6700   LLT SrcTy = MRI.getType(Src0Reg);
6701   unsigned PartSize = SrcTy.getSizeInBits();
6702 
6703   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6704   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
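
  // OR each remaining zero-extended source piece into place at its bit
  // offset.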
6705 
6706   for (unsigned I = 2; I != NumOps; ++I) {
6707     const unsigned Offset = (I - 1) * PartSize;
6708 
6709     Register SrcReg = MI.getOperand(I).getReg();
6710     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6711 
6712     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6713       MRI.createGenericVirtualRegister(WideTy);
6714 
6715     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6716     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6717     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6718     ResultReg = NextResult;
6719   }
6720 
6721   if (DstTy.isPointer()) {
6722     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6723           DstTy.getAddressSpace())) {
6724       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6725       return UnableToLegalize;
6726     }
6727 
6728     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6729   }
6730 
6731   MI.eraseFromParent();
6732   return Legalized;
6733 }
6734 
6735 LegalizerHelper::LegalizeResult
6736 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6737   const unsigned NumDst = MI.getNumOperands() - 1;
6738   Register SrcReg = MI.getOperand(NumDst).getReg();
6739   Register Dst0Reg = MI.getOperand(0).getReg();
6740   LLT DstTy = MRI.getType(Dst0Reg);
6741   if (DstTy.isPointer())
6742     return UnableToLegalize; // TODO
6743 
6744   SrcReg = coerceToScalar(SrcReg);
6745   if (!SrcReg)
6746     return UnableToLegalize;
6747 
6748   // Expand scalarizing unmerge as bitcast to integer and shift.
6749   LLT IntTy = MRI.getType(SrcReg);
6750 
6751   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6752 
6753   const unsigned DstSize = DstTy.getSizeInBits();
6754   unsigned Offset = DstSize;
6755   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6756     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6757     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6758     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6759   }
6760 
6761   MI.eraseFromParent();
6762   return Legalized;
6763 }
6764 
6765 /// Lower a vector extract or insert by writing the vector to a stack temporary
6766 /// and reloading the element or vector.
6767 ///
6768 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6769 ///  =>
6770 ///  %stack_temp = G_FRAME_INDEX
6771 ///  G_STORE %vec, %stack_temp
6772 ///  %idx = clamp(%idx, %vec.getNumElements())
6773 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6774 ///  %dst = G_LOAD %element_ptr
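///
/// G_INSERT_VECTOR_ELT is handled the same way, except the new element is
/// stored through %element_ptr and the whole vector is reloaded.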
6775 LegalizerHelper::LegalizeResult
6776 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6777   Register DstReg = MI.getOperand(0).getReg();
6778   Register SrcVec = MI.getOperand(1).getReg();
6779   Register InsertVal;
6780   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6781     InsertVal = MI.getOperand(2).getReg();
6782 
6783   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6784 
6785   LLT VecTy = MRI.getType(SrcVec);
6786   LLT EltTy = VecTy.getElementType();
6787   if (!EltTy.isByteSized()) { // Not implemented.
6788     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6789     return UnableToLegalize;
6790   }
6791 
6792   unsigned EltBytes = EltTy.getSizeInBytes();
6793   Align VecAlign = getStackTemporaryAlignment(VecTy);
6794   Align EltAlign;
6795 
6796   MachinePointerInfo PtrInfo;
6797   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6798                                         VecAlign, PtrInfo);
6799   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6800 
6801   // Get the pointer to the element, and be sure not to hit undefined behavior
6802   // if the index is out of bounds.
6803   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6804 
6805   int64_t IdxVal;
6806   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6807     int64_t Offset = IdxVal * EltBytes;
6808     PtrInfo = PtrInfo.getWithOffset(Offset);
6809     EltAlign = commonAlignment(VecAlign, Offset);
6810   } else {
6811     // We lose information with a variable offset.
6812     EltAlign = getStackTemporaryAlignment(EltTy);
6813     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6814   }
6815 
6816   if (InsertVal) {
6817     // Write the inserted element
6818     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6819 
6820     // Reload the whole vector.
6821     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6822   } else {
6823     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6824   }
6825 
6826   MI.eraseFromParent();
6827   return Legalized;
6828 }
6829 
6830 LegalizerHelper::LegalizeResult
6831 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6832   Register DstReg = MI.getOperand(0).getReg();
6833   Register Src0Reg = MI.getOperand(1).getReg();
6834   Register Src1Reg = MI.getOperand(2).getReg();
6835   LLT Src0Ty = MRI.getType(Src0Reg);
6836   LLT DstTy = MRI.getType(DstReg);
6837   LLT IdxTy = LLT::scalar(32);
6838 
6839   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6840 
6841   if (DstTy.isScalar()) {
6842     if (Src0Ty.isVector())
6843       return UnableToLegalize;
6844 
6845     // This is just a SELECT.
6846     assert(Mask.size() == 1 && "Expected a single mask element");
6847     Register Val;
6848     if (Mask[0] < 0 || Mask[0] > 1)
6849       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6850     else
6851       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6852     MIRBuilder.buildCopy(DstReg, Val);
6853     MI.eraseFromParent();
6854     return Legalized;
6855   }
6856 
6857   Register Undef;
6858   SmallVector<Register, 32> BuildVec;
6859   LLT EltTy = DstTy.getElementType();
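
  // Expand to one extract (or undef) per mask element, then rebuild the
  // result with G_BUILD_VECTOR.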
6860 
6861   for (int Idx : Mask) {
6862     if (Idx < 0) {
6863       if (!Undef.isValid())
6864         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6865       BuildVec.push_back(Undef);
6866       continue;
6867     }
6868 
6869     if (Src0Ty.isScalar()) {
6870       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6871     } else {
6872       int NumElts = Src0Ty.getNumElements();
6873       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6874       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6875       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6876       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6877       BuildVec.push_back(Extract.getReg(0));
6878     }
6879   }
6880 
6881   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6882   MI.eraseFromParent();
6883   return Legalized;
6884 }
6885 
6886 LegalizerHelper::LegalizeResult
6887 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6888   const auto &MF = *MI.getMF();
6889   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6890   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6891     return UnableToLegalize;
6892 
6893   Register Dst = MI.getOperand(0).getReg();
6894   Register AllocSize = MI.getOperand(1).getReg();
6895   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6896 
6897   LLT PtrTy = MRI.getType(Dst);
6898   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6899 
6900   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6901   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6902   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6903 
6904   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6905   // have to generate an extra instruction to negate the alloc and then use
6906   // G_PTR_ADD to add the negative offset.
6907   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6908   if (Alignment > Align(1)) {
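    // Clearing the low bits rounds the address down to the alignment; since
    // the stack grows down, this stays within the allocated region.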
6909     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6910     AlignMask.negate();
6911     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6912     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6913   }
6914 
6915   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6916   MIRBuilder.buildCopy(SPReg, SPTmp);
6917   MIRBuilder.buildCopy(Dst, SPTmp);
6918 
6919   MI.eraseFromParent();
6920   return Legalized;
6921 }
6922 
6923 LegalizerHelper::LegalizeResult
6924 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6925   Register Dst = MI.getOperand(0).getReg();
6926   Register Src = MI.getOperand(1).getReg();
6927   unsigned Offset = MI.getOperand(2).getImm();
6928 
6929   LLT DstTy = MRI.getType(Dst);
6930   LLT SrcTy = MRI.getType(Src);
6931 
6932   if (DstTy.isScalar() &&
6933       (SrcTy.isScalar() ||
6934        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6935     LLT SrcIntTy = SrcTy;
6936     if (!SrcTy.isScalar()) {
6937       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6938       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6939     }
6940 
6941     if (Offset == 0)
6942       MIRBuilder.buildTrunc(Dst, Src);
6943     else {
6944       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6945       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6946       MIRBuilder.buildTrunc(Dst, Shr);
6947     }
6948 
6949     MI.eraseFromParent();
6950     return Legalized;
6951   }
6952 
6953   return UnableToLegalize;
6954 }
6955 
6956 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6957   Register Dst = MI.getOperand(0).getReg();
6958   Register Src = MI.getOperand(1).getReg();
6959   Register InsertSrc = MI.getOperand(2).getReg();
6960   uint64_t Offset = MI.getOperand(3).getImm();
6961 
  LLT DstTy = MRI.getType(Dst);
6963   LLT InsertTy = MRI.getType(InsertSrc);
6964 
6965   if (InsertTy.isVector() ||
6966       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6967     return UnableToLegalize;
6968 
6969   const DataLayout &DL = MIRBuilder.getDataLayout();
6970   if ((DstTy.isPointer() &&
6971        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6972       (InsertTy.isPointer() &&
6973        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6974     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6975     return UnableToLegalize;
6976   }
6977 
6978   LLT IntDstTy = DstTy;
6979 
6980   if (!DstTy.isScalar()) {
6981     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6982     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6983   }
6984 
6985   if (!InsertTy.isScalar()) {
6986     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6987     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6988   }
6989 
6990   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6991   if (Offset != 0) {
6992     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6993     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6994   }
6995 
6996   APInt MaskVal = APInt::getBitsSetWithWrap(
6997       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6998 
6999   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
7000   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
7001   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
7002 
7003   MIRBuilder.buildCast(Dst, Or);
7004   MI.eraseFromParent();
7005   return Legalized;
7006 }
7007 
7008 LegalizerHelper::LegalizeResult
7009 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7010   Register Dst0 = MI.getOperand(0).getReg();
7011   Register Dst1 = MI.getOperand(1).getReg();
7012   Register LHS = MI.getOperand(2).getReg();
7013   Register RHS = MI.getOperand(3).getReg();
7014   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7015 
7016   LLT Ty = MRI.getType(Dst0);
7017   LLT BoolTy = MRI.getType(Dst1);
7018 
7019   if (IsAdd)
7020     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7021   else
7022     MIRBuilder.buildSub(Dst0, LHS, RHS);
7023 
7024   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7025 
7026   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7027 
7028   // For an addition, the result should be less than one of the operands (LHS)
7029   // if and only if the other operand (RHS) is negative, otherwise there will
7030   // be overflow.
7031   // For a subtraction, the result should be less than one of the operands
7032   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7033   // otherwise there will be overflow.
7034   auto ResultLowerThanLHS =
7035       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7036   auto ConditionRHS = MIRBuilder.buildICmp(
7037       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7038 
7039   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7040   MI.eraseFromParent();
7041   return Legalized;
7042 }
7043 
7044 LegalizerHelper::LegalizeResult
7045 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7046   Register Res = MI.getOperand(0).getReg();
7047   Register LHS = MI.getOperand(1).getReg();
7048   Register RHS = MI.getOperand(2).getReg();
7049   LLT Ty = MRI.getType(Res);
7050   bool IsSigned;
7051   bool IsAdd;
7052   unsigned BaseOp;
7053   switch (MI.getOpcode()) {
7054   default:
7055     llvm_unreachable("unexpected addsat/subsat opcode");
7056   case TargetOpcode::G_UADDSAT:
7057     IsSigned = false;
7058     IsAdd = true;
7059     BaseOp = TargetOpcode::G_ADD;
7060     break;
7061   case TargetOpcode::G_SADDSAT:
7062     IsSigned = true;
7063     IsAdd = true;
7064     BaseOp = TargetOpcode::G_ADD;
7065     break;
7066   case TargetOpcode::G_USUBSAT:
7067     IsSigned = false;
7068     IsAdd = false;
7069     BaseOp = TargetOpcode::G_SUB;
7070     break;
7071   case TargetOpcode::G_SSUBSAT:
7072     IsSigned = true;
7073     IsAdd = false;
7074     BaseOp = TargetOpcode::G_SUB;
7075     break;
7076   }
7077 
7078   if (IsSigned) {
7079     // sadd.sat(a, b) ->
7080     //   hi = 0x7fffffff - smax(a, 0)
7081     //   lo = 0x80000000 - smin(a, 0)
7082     //   a + smin(smax(lo, b), hi)
7083     // ssub.sat(a, b) ->
7084     //   lo = smax(a, -1) - 0x7fffffff
7085     //   hi = smin(a, -1) - 0x80000000
7086     //   a - smin(smax(lo, b), hi)
7087     // TODO: AMDGPU can use a "median of 3" instruction here:
7088     //   a +/- med3(lo, b, hi)
7089     uint64_t NumBits = Ty.getScalarSizeInBits();
7090     auto MaxVal =
7091         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7092     auto MinVal =
7093         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7094     MachineInstrBuilder Hi, Lo;
7095     if (IsAdd) {
7096       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7097       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7098       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7099     } else {
7100       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7101       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7102                                MaxVal);
7103       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7104                                MinVal);
7105     }
7106     auto RHSClamped =
7107         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7108     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7109   } else {
7110     // uadd.sat(a, b) -> a + umin(~a, b)
7111     // usub.sat(a, b) -> a - umin(a, b)
7112     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7113     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7114     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7115   }
7116 
7117   MI.eraseFromParent();
7118   return Legalized;
7119 }
7120 
7121 LegalizerHelper::LegalizeResult
7122 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7123   Register Res = MI.getOperand(0).getReg();
7124   Register LHS = MI.getOperand(1).getReg();
7125   Register RHS = MI.getOperand(2).getReg();
7126   LLT Ty = MRI.getType(Res);
7127   LLT BoolTy = Ty.changeElementSize(1);
7128   bool IsSigned;
7129   bool IsAdd;
7130   unsigned OverflowOp;
7131   switch (MI.getOpcode()) {
7132   default:
7133     llvm_unreachable("unexpected addsat/subsat opcode");
7134   case TargetOpcode::G_UADDSAT:
7135     IsSigned = false;
7136     IsAdd = true;
7137     OverflowOp = TargetOpcode::G_UADDO;
7138     break;
7139   case TargetOpcode::G_SADDSAT:
7140     IsSigned = true;
7141     IsAdd = true;
7142     OverflowOp = TargetOpcode::G_SADDO;
7143     break;
7144   case TargetOpcode::G_USUBSAT:
7145     IsSigned = false;
7146     IsAdd = false;
7147     OverflowOp = TargetOpcode::G_USUBO;
7148     break;
7149   case TargetOpcode::G_SSUBSAT:
7150     IsSigned = true;
7151     IsAdd = false;
7152     OverflowOp = TargetOpcode::G_SSUBO;
7153     break;
7154   }
7155 
7156   auto OverflowRes =
7157       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7158   Register Tmp = OverflowRes.getReg(0);
7159   Register Ov = OverflowRes.getReg(1);
7160   MachineInstrBuilder Clamp;
7161   if (IsSigned) {
7162     // sadd.sat(a, b) ->
7163     //   {tmp, ov} = saddo(a, b)
7164     //   ov ? (tmp >>s 31) + 0x80000000 : r
7165     // ssub.sat(a, b) ->
7166     //   {tmp, ov} = ssubo(a, b)
7167     //   ov ? (tmp >>s 31) + 0x80000000 : r
7168     uint64_t NumBits = Ty.getScalarSizeInBits();
7169     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7170     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7171     auto MinVal =
7172         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7173     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7174   } else {
7175     // uadd.sat(a, b) ->
7176     //   {tmp, ov} = uaddo(a, b)
7177     //   ov ? 0xffffffff : tmp
7178     // usub.sat(a, b) ->
7179     //   {tmp, ov} = usubo(a, b)
7180     //   ov ? 0 : tmp
7181     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7182   }
7183   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7184 
7185   MI.eraseFromParent();
7186   return Legalized;
7187 }
7188 
7189 LegalizerHelper::LegalizeResult
7190 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7191   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7192           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7193          "Expected shlsat opcode!");
7194   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7195   Register Res = MI.getOperand(0).getReg();
7196   Register LHS = MI.getOperand(1).getReg();
7197   Register RHS = MI.getOperand(2).getReg();
7198   LLT Ty = MRI.getType(Res);
7199   LLT BoolTy = Ty.changeElementSize(1);
7200 
7201   unsigned BW = Ty.getScalarSizeInBits();
7202   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7203   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7204                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7205 
7206   MachineInstrBuilder SatVal;
7207   if (IsSigned) {
7208     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7209     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7210     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7211                                     MIRBuilder.buildConstant(Ty, 0));
7212     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7213   } else {
7214     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7215   }
7216   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7217   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7218 
7219   MI.eraseFromParent();
7220   return Legalized;
7221 }
7222 
7223 LegalizerHelper::LegalizeResult
7224 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7225   Register Dst = MI.getOperand(0).getReg();
7226   Register Src = MI.getOperand(1).getReg();
7227   const LLT Ty = MRI.getType(Src);
7228   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7229   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7230 
7231   // Swap most and least significant byte, set remaining bytes in Res to zero.
7232   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7233   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7234   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7235   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7236 
7237   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7238   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7239     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7240     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7241     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7242     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7243     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7244     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7245     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7246     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7247     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7248     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7249     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7250     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7251   }
7252   Res.getInstr()->getOperand(0).setReg(Dst);
7253 
7254   MI.eraseFromParent();
7255   return Legalized;
7256 }
7257 
7258 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7259 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7260                                  MachineInstrBuilder Src, APInt Mask) {
7261   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7262   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7263   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7264   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7265   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7266   return B.buildOr(Dst, LHS, RHS);
7267 }
7268 
7269 LegalizerHelper::LegalizeResult
7270 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7271   Register Dst = MI.getOperand(0).getReg();
7272   Register Src = MI.getOperand(1).getReg();
7273   const LLT Ty = MRI.getType(Src);
7274   unsigned Size = Ty.getSizeInBits();
7275 
7276   MachineInstrBuilder BSWAP =
7277       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7278 
7279   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7280   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7281   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7282   MachineInstrBuilder Swap4 =
7283       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7284 
7285   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7286   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7287   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7288   MachineInstrBuilder Swap2 =
7289       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7290 
7291   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7292   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7293   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7294   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7295 
7296   MI.eraseFromParent();
7297   return Legalized;
7298 }
7299 
7300 LegalizerHelper::LegalizeResult
7301 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7302   MachineFunction &MF = MIRBuilder.getMF();
7303 
7304   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7305   int NameOpIdx = IsRead ? 1 : 0;
7306   int ValRegIndex = IsRead ? 0 : 1;
7307 
7308   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7309   const LLT Ty = MRI.getType(ValReg);
7310   const MDString *RegStr = cast<MDString>(
7311     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7312 
7313   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7314   if (!PhysReg.isValid())
7315     return UnableToLegalize;
7316 
7317   if (IsRead)
7318     MIRBuilder.buildCopy(ValReg, PhysReg);
7319   else
7320     MIRBuilder.buildCopy(PhysReg, ValReg);
7321 
7322   MI.eraseFromParent();
7323   return Legalized;
7324 }
7325 
7326 LegalizerHelper::LegalizeResult
7327 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7328   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7329   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7330   Register Result = MI.getOperand(0).getReg();
7331   LLT OrigTy = MRI.getType(Result);
7332   auto SizeInBits = OrigTy.getScalarSizeInBits();
7333   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7334 
7335   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7336   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7337   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7338   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7339 
7340   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7341   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7342   MIRBuilder.buildTrunc(Result, Shifted);
7343 
7344   MI.eraseFromParent();
7345   return Legalized;
7346 }
7347 
7348 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7349   // Implement vector G_SELECT in terms of XOR, AND, OR.
7350   Register DstReg = MI.getOperand(0).getReg();
7351   Register MaskReg = MI.getOperand(1).getReg();
7352   Register Op1Reg = MI.getOperand(2).getReg();
7353   Register Op2Reg = MI.getOperand(3).getReg();
7354   LLT DstTy = MRI.getType(DstReg);
7355   LLT MaskTy = MRI.getType(MaskReg);
7356   LLT Op1Ty = MRI.getType(Op1Reg);
7357   if (!DstTy.isVector())
7358     return UnableToLegalize;
7359 
7360   // Vector selects can have a scalar predicate. If so, splat into a vector and
7361   // finish for later legalization attempts to try again.
7362   if (MaskTy.isScalar()) {
7363     Register MaskElt = MaskReg;
7364     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7365       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7366     // Generate a vector splat idiom to be pattern matched later.
7367     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7368     Observer.changingInstr(MI);
7369     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7370     Observer.changedInstr(MI);
7371     return Legalized;
7372   }
7373 
7374   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7375     return UnableToLegalize;
7376   }
7377 
7378   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7379   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7380   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7381   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7382   MI.eraseFromParent();
7383   return Legalized;
7384 }
7385 
7386 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7387   // Split DIVREM into individual instructions.
7388   unsigned Opcode = MI.getOpcode();
7389 
7390   MIRBuilder.buildInstr(
7391       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7392                                         : TargetOpcode::G_UDIV,
7393       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7394   MIRBuilder.buildInstr(
7395       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7396                                         : TargetOpcode::G_UREM,
7397       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7398   MI.eraseFromParent();
7399   return Legalized;
7400 }
7401 
7402 LegalizerHelper::LegalizeResult
7403 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7404   // Expand %res = G_ABS %a into:
7405   // %v1 = G_ASHR %a, scalar_size-1
7406   // %v2 = G_ADD %a, %v1
7407   // %res = G_XOR %v2, %v1
7408   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7409   Register OpReg = MI.getOperand(1).getReg();
7410   auto ShiftAmt =
7411       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7412   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7413   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7414   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7415   MI.eraseFromParent();
7416   return Legalized;
7417 }
7418 
7419 LegalizerHelper::LegalizeResult
7420 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7421   // Expand %res = G_ABS %a into:
7422   // %v1 = G_CONSTANT 0
7423   // %v2 = G_SUB %v1, %a
7424   // %res = G_SMAX %a, %v2
7425   Register SrcReg = MI.getOperand(1).getReg();
7426   LLT Ty = MRI.getType(SrcReg);
7427   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7428   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7429   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7430   MI.eraseFromParent();
7431   return Legalized;
7432 }
7433 
7434 LegalizerHelper::LegalizeResult
7435 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7436   Register SrcReg = MI.getOperand(1).getReg();
7437   LLT SrcTy = MRI.getType(SrcReg);
7438   LLT DstTy = MRI.getType(SrcReg);
7439 
7440   // The source could be a scalar if the IR type was <1 x sN>.
7441   if (SrcTy.isScalar()) {
7442     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7443       return UnableToLegalize; // FIXME: handle extension.
7444     // This can be just a plain copy.
7445     Observer.changingInstr(MI);
7446     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7447     Observer.changedInstr(MI);
7448     return Legalized;
7449   }
7450   return UnableToLegalize;;
7451 }
7452 
7453 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7454   // On Darwin, -Os means optimize for size without hurting performance, so
7455   // only really optimize for size when -Oz (MinSize) is used.
7456   if (MF.getTarget().getTargetTriple().isOSDarwin())
7457     return MF.getFunction().hasMinSize();
7458   return MF.getFunction().hasOptSize();
7459 }
7460 
7461 // Returns a list of types to use for memory op lowering in MemOps. A partial
7462 // port of findOptimalMemOpLowering in TargetLowering.
7463 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7464                                           unsigned Limit, const MemOp &Op,
7465                                           unsigned DstAS, unsigned SrcAS,
7466                                           const AttributeList &FuncAttributes,
7467                                           const TargetLowering &TLI) {
7468   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7469     return false;
7470 
7471   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7472 
7473   if (Ty == LLT()) {
7474     // Use the largest scalar type whose alignment constraints are satisfied.
7475     // We only need to check DstAlign here as SrcAlign is always greater or
7476     // equal to DstAlign (or zero).
7477     Ty = LLT::scalar(64);
7478     if (Op.isFixedDstAlign())
7479       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7480              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7481         Ty = LLT::scalar(Ty.getSizeInBytes());
7482     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7483     // FIXME: check for the largest legal type we can load/store to.
7484   }
7485 
7486   unsigned NumMemOps = 0;
7487   uint64_t Size = Op.size();
7488   while (Size) {
7489     unsigned TySize = Ty.getSizeInBytes();
7490     while (TySize > Size) {
7491       // For now, only use non-vector load / store's for the left-over pieces.
7492       LLT NewTy = Ty;
7493       // FIXME: check for mem op safety and legality of the types. Not all of
7494       // SDAGisms map cleanly to GISel concepts.
7495       if (NewTy.isVector())
7496         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7497       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7498       unsigned NewTySize = NewTy.getSizeInBytes();
7499       assert(NewTySize > 0 && "Could not find appropriate type");
7500 
7501       // If the new LLT cannot cover all of the remaining bits, then consider
7502       // issuing a (or a pair of) unaligned and overlapping load / store.
7503       bool Fast;
7504       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7505       MVT VT = getMVTForLLT(Ty);
7506       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7507           TLI.allowsMisalignedMemoryAccesses(
7508               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7509               MachineMemOperand::MONone, &Fast) &&
7510           Fast)
7511         TySize = Size;
7512       else {
7513         Ty = NewTy;
7514         TySize = NewTySize;
7515       }
7516     }
7517 
7518     if (++NumMemOps > Limit)
7519       return false;
7520 
7521     MemOps.push_back(Ty);
7522     Size -= TySize;
7523   }
7524 
7525   return true;
7526 }
7527 
7528 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7529   if (Ty.isVector())
7530     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7531                                 Ty.getNumElements());
7532   return IntegerType::get(C, Ty.getSizeInBits());
7533 }
7534 
7535 // Get a vectorized representation of the memset value operand, GISel edition.
7536 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7537   MachineRegisterInfo &MRI = *MIB.getMRI();
7538   unsigned NumBits = Ty.getScalarSizeInBits();
7539   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
7540   if (!Ty.isVector() && ValVRegAndVal) {
7541     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7542     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7543     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7544   }
7545 
7546   // Extend the byte value to the larger type, and then multiply by a magic
7547   // value 0x010101... in order to replicate it across every byte.
7548   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7549   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7550     return MIB.buildConstant(Ty, 0).getReg(0);
7551   }
7552 
7553   LLT ExtType = Ty.getScalarType();
7554   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7555   if (NumBits > 8) {
7556     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7557     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7558     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7559   }
7560 
7561   // For vector types create a G_BUILD_VECTOR.
7562   if (Ty.isVector())
7563     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7564 
7565   return Val;
7566 }
7567 
7568 LegalizerHelper::LegalizeResult
7569 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7570                              uint64_t KnownLen, Align Alignment,
7571                              bool IsVolatile) {
7572   auto &MF = *MI.getParent()->getParent();
7573   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7574   auto &DL = MF.getDataLayout();
7575   LLVMContext &C = MF.getFunction().getContext();
7576 
7577   assert(KnownLen != 0 && "Have a zero length memset length!");
7578 
7579   bool DstAlignCanChange = false;
7580   MachineFrameInfo &MFI = MF.getFrameInfo();
7581   bool OptSize = shouldLowerMemFuncForSize(MF);
7582 
7583   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7584   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7585     DstAlignCanChange = true;
7586 
7587   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7588   std::vector<LLT> MemOps;
7589 
7590   const auto &DstMMO = **MI.memoperands_begin();
7591   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7592 
7593   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
7594   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7595 
7596   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7597                                      MemOp::Set(KnownLen, DstAlignCanChange,
7598                                                 Alignment,
7599                                                 /*IsZeroMemset=*/IsZeroVal,
7600                                                 /*IsVolatile=*/IsVolatile),
7601                                      DstPtrInfo.getAddrSpace(), ~0u,
7602                                      MF.getFunction().getAttributes(), TLI))
7603     return UnableToLegalize;
7604 
7605   if (DstAlignCanChange) {
7606     // Get an estimate of the type from the LLT.
7607     Type *IRTy = getTypeForLLT(MemOps[0], C);
7608     Align NewAlign = DL.getABITypeAlign(IRTy);
7609     if (NewAlign > Alignment) {
7610       Alignment = NewAlign;
7611       unsigned FI = FIDef->getOperand(1).getIndex();
7612       // Give the stack frame object a larger alignment if needed.
7613       if (MFI.getObjectAlign(FI) < Alignment)
7614         MFI.setObjectAlignment(FI, Alignment);
7615     }
7616   }
7617 
7618   MachineIRBuilder MIB(MI);
7619   // Find the largest store and generate the bit pattern for it.
7620   LLT LargestTy = MemOps[0];
7621   for (unsigned i = 1; i < MemOps.size(); i++)
7622     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7623       LargestTy = MemOps[i];
7624 
7625   // The memset stored value is always defined as an s8, so in order to make it
7626   // work with larger store types we need to repeat the bit pattern across the
7627   // wider type.
7628   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7629 
7630   if (!MemSetValue)
7631     return UnableToLegalize;
7632 
7633   // Generate the stores. For each store type in the list, we generate the
7634   // matching store of that type to the destination address.
7635   LLT PtrTy = MRI.getType(Dst);
7636   unsigned DstOff = 0;
7637   unsigned Size = KnownLen;
7638   for (unsigned I = 0; I < MemOps.size(); I++) {
7639     LLT Ty = MemOps[I];
7640     unsigned TySize = Ty.getSizeInBytes();
7641     if (TySize > Size) {
7642       // Issuing an unaligned load / store pair that overlaps with the previous
7643       // pair. Adjust the offset accordingly.
7644       assert(I == MemOps.size() - 1 && I != 0);
7645       DstOff -= TySize - Size;
7646     }
7647 
7648     // If this store is smaller than the largest store see whether we can get
7649     // the smaller value for free with a truncate.
7650     Register Value = MemSetValue;
7651     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7652       MVT VT = getMVTForLLT(Ty);
7653       MVT LargestVT = getMVTForLLT(LargestTy);
7654       if (!LargestTy.isVector() && !Ty.isVector() &&
7655           TLI.isTruncateFree(LargestVT, VT))
7656         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7657       else
7658         Value = getMemsetValue(Val, Ty, MIB);
7659       if (!Value)
7660         return UnableToLegalize;
7661     }
7662 
7663     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7664 
7665     Register Ptr = Dst;
7666     if (DstOff != 0) {
7667       auto Offset =
7668           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7669       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7670     }
7671 
7672     MIB.buildStore(Value, Ptr, *StoreMMO);
7673     DstOff += Ty.getSizeInBytes();
7674     Size -= TySize;
7675   }
7676 
7677   MI.eraseFromParent();
7678   return Legalized;
7679 }
7680 
7681 LegalizerHelper::LegalizeResult
7682 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7683   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7684 
7685   Register Dst = MI.getOperand(0).getReg();
7686   Register Src = MI.getOperand(1).getReg();
7687   Register Len = MI.getOperand(2).getReg();
7688 
7689   const auto *MMOIt = MI.memoperands_begin();
7690   const MachineMemOperand *MemOp = *MMOIt;
7691   bool IsVolatile = MemOp->isVolatile();
7692 
7693   // See if this is a constant length copy
7694   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
7695   // FIXME: support dynamically sized G_MEMCPY_INLINE
7696   assert(LenVRegAndVal.hasValue() &&
7697          "inline memcpy with dynamic size is not yet supported");
7698   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7699   if (KnownLen == 0) {
7700     MI.eraseFromParent();
7701     return Legalized;
7702   }
7703 
7704   const auto &DstMMO = **MI.memoperands_begin();
7705   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7706   Align DstAlign = DstMMO.getBaseAlign();
7707   Align SrcAlign = SrcMMO.getBaseAlign();
7708 
7709   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7710                            IsVolatile);
7711 }
7712 
7713 LegalizerHelper::LegalizeResult
7714 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7715                                    uint64_t KnownLen, Align DstAlign,
7716                                    Align SrcAlign, bool IsVolatile) {
7717   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7718   return lowerMemcpy(MI, Dst, Src, KnownLen,
7719                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7720                      IsVolatile);
7721 }
7722 
7723 LegalizerHelper::LegalizeResult
7724 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7725                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7726                              Align SrcAlign, bool IsVolatile) {
7727   auto &MF = *MI.getParent()->getParent();
7728   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7729   auto &DL = MF.getDataLayout();
7730   LLVMContext &C = MF.getFunction().getContext();
7731 
7732   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7733 
7734   bool DstAlignCanChange = false;
7735   MachineFrameInfo &MFI = MF.getFrameInfo();
7736   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7737 
7738   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7739   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7740     DstAlignCanChange = true;
7741 
7742   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7743   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7744   // if the memcpy is in a tail call position.
7745 
7746   std::vector<LLT> MemOps;
7747 
7748   const auto &DstMMO = **MI.memoperands_begin();
7749   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7750   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7751   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7752 
7753   if (!findGISelOptimalMemOpLowering(
7754           MemOps, Limit,
7755           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7756                       IsVolatile),
7757           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7758           MF.getFunction().getAttributes(), TLI))
7759     return UnableToLegalize;
7760 
7761   if (DstAlignCanChange) {
7762     // Get an estimate of the type from the LLT.
7763     Type *IRTy = getTypeForLLT(MemOps[0], C);
7764     Align NewAlign = DL.getABITypeAlign(IRTy);
7765 
7766     // Don't promote to an alignment that would require dynamic stack
7767     // realignment.
7768     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7769     if (!TRI->hasStackRealignment(MF))
7770       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7771         NewAlign = NewAlign / 2;
7772 
7773     if (NewAlign > Alignment) {
7774       Alignment = NewAlign;
7775       unsigned FI = FIDef->getOperand(1).getIndex();
7776       // Give the stack frame object a larger alignment if needed.
7777       if (MFI.getObjectAlign(FI) < Alignment)
7778         MFI.setObjectAlignment(FI, Alignment);
7779     }
7780   }
7781 
7782   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7783 
7784   MachineIRBuilder MIB(MI);
7785   // Now we need to emit a pair of load and stores for each of the types we've
7786   // collected. I.e. for each type, generate a load from the source pointer of
7787   // that type width, and then generate a corresponding store to the dest buffer
7788   // of that value loaded. This can result in a sequence of loads and stores
7789   // mixed types, depending on what the target specifies as good types to use.
7790   unsigned CurrOffset = 0;
7791   LLT PtrTy = MRI.getType(Src);
7792   unsigned Size = KnownLen;
7793   for (auto CopyTy : MemOps) {
7794     // Issuing an unaligned load / store pair  that overlaps with the previous
7795     // pair. Adjust the offset accordingly.
7796     if (CopyTy.getSizeInBytes() > Size)
7797       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7798 
7799     // Construct MMOs for the accesses.
7800     auto *LoadMMO =
7801         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7802     auto *StoreMMO =
7803         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7804 
7805     // Create the load.
7806     Register LoadPtr = Src;
7807     Register Offset;
7808     if (CurrOffset != 0) {
7809       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
7810                    .getReg(0);
7811       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7812     }
7813     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7814 
7815     // Create the store.
7816     Register StorePtr =
7817         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7818     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7819     CurrOffset += CopyTy.getSizeInBytes();
7820     Size -= CopyTy.getSizeInBytes();
7821   }
7822 
7823   MI.eraseFromParent();
7824   return Legalized;
7825 }
7826 
7827 LegalizerHelper::LegalizeResult
7828 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7829                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7830                               bool IsVolatile) {
7831   auto &MF = *MI.getParent()->getParent();
7832   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7833   auto &DL = MF.getDataLayout();
7834   LLVMContext &C = MF.getFunction().getContext();
7835 
7836   assert(KnownLen != 0 && "Have a zero length memmove length!");
7837 
7838   bool DstAlignCanChange = false;
7839   MachineFrameInfo &MFI = MF.getFrameInfo();
7840   bool OptSize = shouldLowerMemFuncForSize(MF);
7841   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7842 
7843   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7844   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7845     DstAlignCanChange = true;
7846 
7847   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7848   std::vector<LLT> MemOps;
7849 
7850   const auto &DstMMO = **MI.memoperands_begin();
7851   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7852   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7853   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7854 
7855   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7856   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
7857   // same thing here.
7858   if (!findGISelOptimalMemOpLowering(
7859           MemOps, Limit,
7860           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7861                       /*IsVolatile*/ true),
7862           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7863           MF.getFunction().getAttributes(), TLI))
7864     return UnableToLegalize;
7865 
7866   if (DstAlignCanChange) {
7867     // Get an estimate of the type from the LLT.
7868     Type *IRTy = getTypeForLLT(MemOps[0], C);
7869     Align NewAlign = DL.getABITypeAlign(IRTy);
7870 
7871     // Don't promote to an alignment that would require dynamic stack
7872     // realignment.
7873     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7874     if (!TRI->hasStackRealignment(MF))
7875       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7876         NewAlign = NewAlign / 2;
7877 
7878     if (NewAlign > Alignment) {
7879       Alignment = NewAlign;
7880       unsigned FI = FIDef->getOperand(1).getIndex();
7881       // Give the stack frame object a larger alignment if needed.
7882       if (MFI.getObjectAlign(FI) < Alignment)
7883         MFI.setObjectAlignment(FI, Alignment);
7884     }
7885   }
7886 
7887   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7888 
7889   MachineIRBuilder MIB(MI);
7890   // Memmove requires that we perform the loads first before issuing the stores.
7891   // Apart from that, this loop is pretty much doing the same thing as the
7892   // memcpy codegen function.
7893   unsigned CurrOffset = 0;
7894   LLT PtrTy = MRI.getType(Src);
7895   SmallVector<Register, 16> LoadVals;
7896   for (auto CopyTy : MemOps) {
7897     // Construct MMO for the load.
7898     auto *LoadMMO =
7899         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7900 
7901     // Create the load.
7902     Register LoadPtr = Src;
7903     if (CurrOffset != 0) {
7904       auto Offset =
7905           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7906       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7907     }
7908     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7909     CurrOffset += CopyTy.getSizeInBytes();
7910   }
7911 
7912   CurrOffset = 0;
7913   for (unsigned I = 0; I < MemOps.size(); ++I) {
7914     LLT CopyTy = MemOps[I];
7915     // Now store the values loaded.
7916     auto *StoreMMO =
7917         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7918 
7919     Register StorePtr = Dst;
7920     if (CurrOffset != 0) {
7921       auto Offset =
7922           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7923       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7924     }
7925     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7926     CurrOffset += CopyTy.getSizeInBytes();
7927   }
7928   MI.eraseFromParent();
7929   return Legalized;
7930 }
7931 
7932 LegalizerHelper::LegalizeResult
7933 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7934   const unsigned Opc = MI.getOpcode();
7935   // This combine is fairly complex so it's not written with a separate
7936   // matcher function.
7937   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7938           Opc == TargetOpcode::G_MEMSET) &&
7939          "Expected memcpy like instruction");
7940 
7941   auto MMOIt = MI.memoperands_begin();
7942   const MachineMemOperand *MemOp = *MMOIt;
7943 
7944   Align DstAlign = MemOp->getBaseAlign();
7945   Align SrcAlign;
7946   Register Dst = MI.getOperand(0).getReg();
7947   Register Src = MI.getOperand(1).getReg();
7948   Register Len = MI.getOperand(2).getReg();
7949 
7950   if (Opc != TargetOpcode::G_MEMSET) {
7951     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7952     MemOp = *(++MMOIt);
7953     SrcAlign = MemOp->getBaseAlign();
7954   }
7955 
7956   // See if this is a constant length copy
7957   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
7958   if (!LenVRegAndVal)
7959     return UnableToLegalize;
7960   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7961 
7962   if (KnownLen == 0) {
7963     MI.eraseFromParent();
7964     return Legalized;
7965   }
7966 
7967   bool IsVolatile = MemOp->isVolatile();
7968   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7969     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7970                              IsVolatile);
7971 
7972   // Don't try to optimize volatile.
7973   if (IsVolatile)
7974     return UnableToLegalize;
7975 
7976   if (MaxLen && KnownLen > MaxLen)
7977     return UnableToLegalize;
7978 
7979   if (Opc == TargetOpcode::G_MEMCPY) {
7980     auto &MF = *MI.getParent()->getParent();
7981     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7982     bool OptSize = shouldLowerMemFuncForSize(MF);
7983     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7984     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7985                        IsVolatile);
7986   }
7987   if (Opc == TargetOpcode::G_MEMMOVE)
7988     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7989   if (Opc == TargetOpcode::G_MEMSET)
7990     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7991   return UnableToLegalize;
7992 }
7993