1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetFrameLowering.h"
24 #include "llvm/CodeGen/TargetInstrInfo.h"
25 #include "llvm/CodeGen/TargetLowering.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include "llvm/Target/TargetMachine.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
42 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
43 /// with any leftover piece as type \p LeftoverTy
44 ///
45 /// Returns -1 in the first element of the pair if the breakdown is not
46 /// satisfiable.
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
74 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
75 
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
95 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
96                                  GISelChangeObserver &Observer,
97                                  MachineIRBuilder &Builder)
98     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
99       LI(*MF.getSubtarget().getLegalizerInfo()),
100       TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &B)
105   : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
106     TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
108 LegalizerHelper::LegalizeResult
109 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
110                                    LostDebugLocObserver &LocObserver) {
111   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
112 
113   MIRBuilder.setInstrAndDebugLoc(MI);
114 
115   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
116       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
117     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
118   auto Step = LI.getAction(MI, MRI);
119   switch (Step.Action) {
120   case Legal:
121     LLVM_DEBUG(dbgs() << ".. Already legal\n");
122     return AlreadyLegal;
123   case Libcall:
124     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
125     return libcall(MI, LocObserver);
126   case NarrowScalar:
127     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
128     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
129   case WidenScalar:
130     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
131     return widenScalar(MI, Step.TypeIdx, Step.NewType);
132   case Bitcast:
133     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
134     return bitcast(MI, Step.TypeIdx, Step.NewType);
135   case Lower:
136     LLVM_DEBUG(dbgs() << ".. Lower\n");
137     return lower(MI, Step.TypeIdx, Step.NewType);
138   case FewerElements:
139     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
140     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
141   case MoreElements:
142     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
143     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
144   case Custom:
145     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
146     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
147   default:
148     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
149     return UnableToLegalize;
150   }
151 }
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
160 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
161                                    LLT MainTy, LLT &LeftoverTy,
162                                    SmallVectorImpl<Register> &VRegs,
163                                    SmallVectorImpl<Register> &LeftoverRegs) {
164   assert(!LeftoverTy.isValid() && "this is an out argument");
165 
166   unsigned RegSize = RegTy.getSizeInBits();
167   unsigned MainSize = MainTy.getSizeInBits();
168   unsigned NumParts = RegSize / MainSize;
169   unsigned LeftoverSize = RegSize - NumParts * MainSize;
170 
171   // Use an unmerge when possible.
172   if (LeftoverSize == 0) {
173     for (unsigned I = 0; I < NumParts; ++I)
174       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
175     MIRBuilder.buildUnmerge(VRegs, Reg);
176     return true;
177   }
178 
179   if (MainTy.isVector()) {
180     unsigned EltSize = MainTy.getScalarSizeInBits();
181     if (LeftoverSize % EltSize != 0)
182       return false;
183     LeftoverTy = LLT::scalarOrVector(
184         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
185   } else {
186     LeftoverTy = LLT::scalar(LeftoverSize);
187   }
188 
189   // For irregular sizes, extract the individual parts.
190   for (unsigned I = 0; I != NumParts; ++I) {
191     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
192     VRegs.push_back(NewReg);
193     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
194   }
195 
196   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
197        Offset += LeftoverSize) {
198     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
199     LeftoverRegs.push_back(NewReg);
200     MIRBuilder.buildExtract(NewReg, Reg, Offset);
201   }
202 
203   return true;
204 }
205 
206 void LegalizerHelper::insertParts(Register DstReg,
207                                   LLT ResultTy, LLT PartTy,
208                                   ArrayRef<Register> PartRegs,
209                                   LLT LeftoverTy,
210                                   ArrayRef<Register> LeftoverRegs) {
211   if (!LeftoverTy.isValid()) {
212     assert(LeftoverRegs.empty());
213 
214     if (!ResultTy.isVector()) {
215       MIRBuilder.buildMerge(DstReg, PartRegs);
216       return;
217     }
218 
219     if (PartTy.isVector())
220       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
221     else
222       MIRBuilder.buildBuildVector(DstReg, PartRegs);
223     return;
224   }
225 
226   SmallVector<Register> GCDRegs;
227   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
228   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
229     extractGCDType(GCDRegs, GCDTy, PartReg);
230   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
231   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
232 }
233 
234 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
235 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
236                               const MachineInstr &MI) {
237   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
238 
239   const int StartIdx = Regs.size();
240   const int NumResults = MI.getNumOperands() - 1;
241   Regs.resize(Regs.size() + NumResults);
242   for (int I = 0; I != NumResults; ++I)
243     Regs[StartIdx + I] = MI.getOperand(I).getReg();
244 }
245 
246 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
247                                      LLT GCDTy, Register SrcReg) {
248   LLT SrcTy = MRI.getType(SrcReg);
249   if (SrcTy == GCDTy) {
250     // If the source already evenly divides the result type, we don't need to do
251     // anything.
252     Parts.push_back(SrcReg);
253   } else {
254     // Need to split into common type sized pieces.
255     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
256     getUnmergeResults(Parts, *Unmerge);
257   }
258 }
259 
260 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
261                                     LLT NarrowTy, Register SrcReg) {
262   LLT SrcTy = MRI.getType(SrcReg);
263   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
264   extractGCDType(Parts, GCDTy, SrcReg);
265   return GCDTy;
266 }
267 
268 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
269                                          SmallVectorImpl<Register> &VRegs,
270                                          unsigned PadStrategy) {
271   LLT LCMTy = getLCMType(DstTy, NarrowTy);
272 
273   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
274   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
275   int NumOrigSrc = VRegs.size();
276 
277   Register PadReg;
278 
279   // Get a value we can use to pad the source value if the sources won't evenly
280   // cover the result type.
281   if (NumOrigSrc < NumParts * NumSubParts) {
282     if (PadStrategy == TargetOpcode::G_ZEXT)
283       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
284     else if (PadStrategy == TargetOpcode::G_ANYEXT)
285       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
286     else {
287       assert(PadStrategy == TargetOpcode::G_SEXT);
288 
289       // Shift the sign bit of the low register through the high register.
290       auto ShiftAmt =
291         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
292       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
293     }
294   }
295 
296   // Registers for the final merge to be produced.
297   SmallVector<Register, 4> Remerge(NumParts);
298 
299   // Registers needed for intermediate merges, which will be merged into a
300   // source for Remerge.
301   SmallVector<Register, 4> SubMerge(NumSubParts);
302 
303   // Once we've fully read off the end of the original source bits, we can reuse
304   // the same high bits for remaining padding elements.
305   Register AllPadReg;
306 
307   // Build merges to the LCM type to cover the original result type.
308   for (int I = 0; I != NumParts; ++I) {
309     bool AllMergePartsArePadding = true;
310 
311     // Build the requested merges to the requested type.
312     for (int J = 0; J != NumSubParts; ++J) {
313       int Idx = I * NumSubParts + J;
314       if (Idx >= NumOrigSrc) {
315         SubMerge[J] = PadReg;
316         continue;
317       }
318 
319       SubMerge[J] = VRegs[Idx];
320 
321       // There are meaningful bits here we can't reuse later.
322       AllMergePartsArePadding = false;
323     }
324 
325     // If we've filled up a complete piece with padding bits, we can directly
326     // emit the natural sized constant if applicable, rather than a merge of
327     // smaller constants.
328     if (AllMergePartsArePadding && !AllPadReg) {
329       if (PadStrategy == TargetOpcode::G_ANYEXT)
330         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
331       else if (PadStrategy == TargetOpcode::G_ZEXT)
332         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
333 
334       // If this is a sign extension, we can't materialize a trivial constant
335       // with the right type and have to produce a merge.
336     }
337 
338     if (AllPadReg) {
339       // Avoid creating additional instructions if we're just adding additional
340       // copies of padding bits.
341       Remerge[I] = AllPadReg;
342       continue;
343     }
344 
345     if (NumSubParts == 1)
346       Remerge[I] = SubMerge[0];
347     else
348       Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
349 
350     // In the sign extend padding case, re-use the first all-signbit merge.
351     if (AllMergePartsArePadding && !AllPadReg)
352       AllPadReg = Remerge[I];
353   }
354 
355   VRegs = std::move(Remerge);
356   return LCMTy;
357 }
358 
359 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
360                                                ArrayRef<Register> RemergeRegs) {
361   LLT DstTy = MRI.getType(DstReg);
362 
363   // Create the merge to the widened source, and extract the relevant bits into
364   // the result.
365 
366   if (DstTy == LCMTy) {
367     MIRBuilder.buildMerge(DstReg, RemergeRegs);
368     return;
369   }
370 
371   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
372   if (DstTy.isScalar() && LCMTy.isScalar()) {
373     MIRBuilder.buildTrunc(DstReg, Remerge);
374     return;
375   }
376 
377   if (LCMTy.isVector()) {
378     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
379     SmallVector<Register, 8> UnmergeDefs(NumDefs);
380     UnmergeDefs[0] = DstReg;
381     for (unsigned I = 1; I != NumDefs; ++I)
382       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
383 
384     MIRBuilder.buildUnmerge(UnmergeDefs,
385                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
386     return;
387   }
388 
389   llvm_unreachable("unhandled case");
390 }
391 
392 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
393 #define RTLIBCASE_INT(LibcallPrefix)                                           \
394   do {                                                                         \
395     switch (Size) {                                                            \
396     case 32:                                                                   \
397       return RTLIB::LibcallPrefix##32;                                         \
398     case 64:                                                                   \
399       return RTLIB::LibcallPrefix##64;                                         \
400     case 128:                                                                  \
401       return RTLIB::LibcallPrefix##128;                                        \
402     default:                                                                   \
403       llvm_unreachable("unexpected size");                                     \
404     }                                                                          \
405   } while (0)
406 
407 #define RTLIBCASE(LibcallPrefix)                                               \
408   do {                                                                         \
409     switch (Size) {                                                            \
410     case 32:                                                                   \
411       return RTLIB::LibcallPrefix##32;                                         \
412     case 64:                                                                   \
413       return RTLIB::LibcallPrefix##64;                                         \
414     case 80:                                                                   \
415       return RTLIB::LibcallPrefix##80;                                         \
416     case 128:                                                                  \
417       return RTLIB::LibcallPrefix##128;                                        \
418     default:                                                                   \
419       llvm_unreachable("unexpected size");                                     \
420     }                                                                          \
421   } while (0)
422 
423   switch (Opcode) {
424   case TargetOpcode::G_SDIV:
425     RTLIBCASE_INT(SDIV_I);
426   case TargetOpcode::G_UDIV:
427     RTLIBCASE_INT(UDIV_I);
428   case TargetOpcode::G_SREM:
429     RTLIBCASE_INT(SREM_I);
430   case TargetOpcode::G_UREM:
431     RTLIBCASE_INT(UREM_I);
432   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
433     RTLIBCASE_INT(CTLZ_I);
434   case TargetOpcode::G_FADD:
435     RTLIBCASE(ADD_F);
436   case TargetOpcode::G_FSUB:
437     RTLIBCASE(SUB_F);
438   case TargetOpcode::G_FMUL:
439     RTLIBCASE(MUL_F);
440   case TargetOpcode::G_FDIV:
441     RTLIBCASE(DIV_F);
442   case TargetOpcode::G_FEXP:
443     RTLIBCASE(EXP_F);
444   case TargetOpcode::G_FEXP2:
445     RTLIBCASE(EXP2_F);
446   case TargetOpcode::G_FREM:
447     RTLIBCASE(REM_F);
448   case TargetOpcode::G_FPOW:
449     RTLIBCASE(POW_F);
450   case TargetOpcode::G_FMA:
451     RTLIBCASE(FMA_F);
452   case TargetOpcode::G_FSIN:
453     RTLIBCASE(SIN_F);
454   case TargetOpcode::G_FCOS:
455     RTLIBCASE(COS_F);
456   case TargetOpcode::G_FLOG10:
457     RTLIBCASE(LOG10_F);
458   case TargetOpcode::G_FLOG:
459     RTLIBCASE(LOG_F);
460   case TargetOpcode::G_FLOG2:
461     RTLIBCASE(LOG2_F);
462   case TargetOpcode::G_FCEIL:
463     RTLIBCASE(CEIL_F);
464   case TargetOpcode::G_FFLOOR:
465     RTLIBCASE(FLOOR_F);
466   case TargetOpcode::G_FMINNUM:
467     RTLIBCASE(FMIN_F);
468   case TargetOpcode::G_FMAXNUM:
469     RTLIBCASE(FMAX_F);
470   case TargetOpcode::G_FSQRT:
471     RTLIBCASE(SQRT_F);
472   case TargetOpcode::G_FRINT:
473     RTLIBCASE(RINT_F);
474   case TargetOpcode::G_FNEARBYINT:
475     RTLIBCASE(NEARBYINT_F);
476   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
477     RTLIBCASE(ROUNDEVEN_F);
478   }
479   llvm_unreachable("Unknown libcall function");
480 }
481 
482 /// True if an instruction is in tail position in its caller. Intended for
483 /// legalizing libcalls as tail calls when possible.
484 static bool isLibCallInTailPosition(MachineInstr &MI,
485                                     const TargetInstrInfo &TII,
486                                     MachineRegisterInfo &MRI) {
487   MachineBasicBlock &MBB = *MI.getParent();
488   const Function &F = MBB.getParent()->getFunction();
489 
490   // Conservatively require the attributes of the call to match those of
491   // the return. Ignore NoAlias and NonNull because they don't affect the
492   // call sequence.
493   AttributeList CallerAttrs = F.getAttributes();
494   if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
495           .removeAttribute(Attribute::NoAlias)
496           .removeAttribute(Attribute::NonNull)
497           .hasAttributes())
498     return false;
499 
500   // It's not safe to eliminate the sign / zero extension of the return value.
501   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
502       CallerAttrs.hasRetAttr(Attribute::SExt))
503     return false;
504 
505   // Only tail call if the following instruction is a standard return or if we
506   // have a `thisreturn` callee, and a sequence like:
507   //
508   //   G_MEMCPY %0, %1, %2
509   //   $x0 = COPY %0
510   //   RET_ReallyLR implicit $x0
511   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
512   if (Next != MBB.instr_end() && Next->isCopy()) {
513     switch (MI.getOpcode()) {
514     default:
515       llvm_unreachable("unsupported opcode");
516     case TargetOpcode::G_BZERO:
517       return false;
518     case TargetOpcode::G_MEMCPY:
519     case TargetOpcode::G_MEMMOVE:
520     case TargetOpcode::G_MEMSET:
521       break;
522     }
523 
524     Register VReg = MI.getOperand(0).getReg();
525     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
526       return false;
527 
528     Register PReg = Next->getOperand(0).getReg();
529     if (!PReg.isPhysical())
530       return false;
531 
532     auto Ret = next_nodbg(Next, MBB.instr_end());
533     if (Ret == MBB.instr_end() || !Ret->isReturn())
534       return false;
535 
536     if (Ret->getNumImplicitOperands() != 1)
537       return false;
538 
539     if (PReg != Ret->getOperand(0).getReg())
540       return false;
541 
542     // Skip over the COPY that we just validated.
543     Next = Ret;
544   }
545 
546   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
547     return false;
548 
549   return true;
550 }
551 
552 LegalizerHelper::LegalizeResult
553 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
554                     const CallLowering::ArgInfo &Result,
555                     ArrayRef<CallLowering::ArgInfo> Args,
556                     const CallingConv::ID CC) {
557   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
558 
559   CallLowering::CallLoweringInfo Info;
560   Info.CallConv = CC;
561   Info.Callee = MachineOperand::CreateES(Name);
562   Info.OrigRet = Result;
563   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
564   if (!CLI.lowerCall(MIRBuilder, Info))
565     return LegalizerHelper::UnableToLegalize;
566 
567   return LegalizerHelper::Legalized;
568 }
569 
570 LegalizerHelper::LegalizeResult
571 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
572                     const CallLowering::ArgInfo &Result,
573                     ArrayRef<CallLowering::ArgInfo> Args) {
574   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
575   const char *Name = TLI.getLibcallName(Libcall);
576   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
577   return createLibcall(MIRBuilder, Name, Result, Args, CC);
578 }
579 
580 // Useful for libcalls where all operands have the same type.
581 static LegalizerHelper::LegalizeResult
582 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
583               Type *OpType) {
584   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
585 
586   // FIXME: What does the original arg index mean here?
587   SmallVector<CallLowering::ArgInfo, 3> Args;
588   for (unsigned i = 1; i < MI.getNumOperands(); i++)
589     Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
590   return createLibcall(MIRBuilder, Libcall,
591                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
592 }
593 
594 LegalizerHelper::LegalizeResult
595 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
596                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
597   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
598 
599   SmallVector<CallLowering::ArgInfo, 3> Args;
600   // Add all the args, except for the last which is an imm denoting 'tail'.
601   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
602     Register Reg = MI.getOperand(i).getReg();
603 
604     // Need derive an IR type for call lowering.
605     LLT OpLLT = MRI.getType(Reg);
606     Type *OpTy = nullptr;
607     if (OpLLT.isPointer())
608       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
609     else
610       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
611     Args.push_back({Reg, OpTy, 0});
612   }
613 
614   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
615   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
616   RTLIB::Libcall RTLibcall;
617   unsigned Opc = MI.getOpcode();
618   switch (Opc) {
619   case TargetOpcode::G_BZERO:
620     RTLibcall = RTLIB::BZERO;
621     break;
622   case TargetOpcode::G_MEMCPY:
623     RTLibcall = RTLIB::MEMCPY;
624     Args[0].Flags[0].setReturned();
625     break;
626   case TargetOpcode::G_MEMMOVE:
627     RTLibcall = RTLIB::MEMMOVE;
628     Args[0].Flags[0].setReturned();
629     break;
630   case TargetOpcode::G_MEMSET:
631     RTLibcall = RTLIB::MEMSET;
632     Args[0].Flags[0].setReturned();
633     break;
634   default:
635     llvm_unreachable("unsupported opcode");
636   }
637   const char *Name = TLI.getLibcallName(RTLibcall);
638 
639   // Unsupported libcall on the target.
640   if (!Name) {
641     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
642                       << MIRBuilder.getTII().getName(Opc) << "\n");
643     return LegalizerHelper::UnableToLegalize;
644   }
645 
646   CallLowering::CallLoweringInfo Info;
647   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
648   Info.Callee = MachineOperand::CreateES(Name);
649   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
650   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
651                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
652 
653   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
654   if (!CLI.lowerCall(MIRBuilder, Info))
655     return LegalizerHelper::UnableToLegalize;
656 
657   if (Info.LoweredTailCall) {
658     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
659 
660     // Check debug locations before removing the return.
661     LocObserver.checkpoint(true);
662 
663     // We must have a return following the call (or debug insts) to get past
664     // isLibCallInTailPosition.
665     do {
666       MachineInstr *Next = MI.getNextNode();
667       assert(Next &&
668              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
669              "Expected instr following MI to be return or debug inst?");
670       // We lowered a tail call, so the call is now the return from the block.
671       // Delete the old return.
672       Next->eraseFromParent();
673     } while (MI.getNextNode());
674 
675     // We expect to lose the debug location from the return.
676     LocObserver.checkpoint(false);
677   }
678 
679   return LegalizerHelper::Legalized;
680 }
681 
682 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
683                                        Type *FromType) {
684   auto ToMVT = MVT::getVT(ToType);
685   auto FromMVT = MVT::getVT(FromType);
686 
687   switch (Opcode) {
688   case TargetOpcode::G_FPEXT:
689     return RTLIB::getFPEXT(FromMVT, ToMVT);
690   case TargetOpcode::G_FPTRUNC:
691     return RTLIB::getFPROUND(FromMVT, ToMVT);
692   case TargetOpcode::G_FPTOSI:
693     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
694   case TargetOpcode::G_FPTOUI:
695     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
696   case TargetOpcode::G_SITOFP:
697     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
698   case TargetOpcode::G_UITOFP:
699     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
700   }
701   llvm_unreachable("Unsupported libcall function");
702 }
703 
704 static LegalizerHelper::LegalizeResult
705 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
706                   Type *FromType) {
707   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
708   return createLibcall(MIRBuilder, Libcall,
709                        {MI.getOperand(0).getReg(), ToType, 0},
710                        {{MI.getOperand(1).getReg(), FromType, 0}});
711 }
712 
713 LegalizerHelper::LegalizeResult
714 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
715   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
716   unsigned Size = LLTy.getSizeInBits();
717   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
718 
719   switch (MI.getOpcode()) {
720   default:
721     return UnableToLegalize;
722   case TargetOpcode::G_SDIV:
723   case TargetOpcode::G_UDIV:
724   case TargetOpcode::G_SREM:
725   case TargetOpcode::G_UREM:
726   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
727     Type *HLTy = IntegerType::get(Ctx, Size);
728     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
729     if (Status != Legalized)
730       return Status;
731     break;
732   }
733   case TargetOpcode::G_FADD:
734   case TargetOpcode::G_FSUB:
735   case TargetOpcode::G_FMUL:
736   case TargetOpcode::G_FDIV:
737   case TargetOpcode::G_FMA:
738   case TargetOpcode::G_FPOW:
739   case TargetOpcode::G_FREM:
740   case TargetOpcode::G_FCOS:
741   case TargetOpcode::G_FSIN:
742   case TargetOpcode::G_FLOG10:
743   case TargetOpcode::G_FLOG:
744   case TargetOpcode::G_FLOG2:
745   case TargetOpcode::G_FEXP:
746   case TargetOpcode::G_FEXP2:
747   case TargetOpcode::G_FCEIL:
748   case TargetOpcode::G_FFLOOR:
749   case TargetOpcode::G_FMINNUM:
750   case TargetOpcode::G_FMAXNUM:
751   case TargetOpcode::G_FSQRT:
752   case TargetOpcode::G_FRINT:
753   case TargetOpcode::G_FNEARBYINT:
754   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
755     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
756     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
757       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
758       return UnableToLegalize;
759     }
760     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
761     if (Status != Legalized)
762       return Status;
763     break;
764   }
765   case TargetOpcode::G_FPEXT:
766   case TargetOpcode::G_FPTRUNC: {
767     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
768     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
769     if (!FromTy || !ToTy)
770       return UnableToLegalize;
771     LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
772     if (Status != Legalized)
773       return Status;
774     break;
775   }
776   case TargetOpcode::G_FPTOSI:
777   case TargetOpcode::G_FPTOUI: {
778     // FIXME: Support other types
779     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
780     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
781     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
782       return UnableToLegalize;
783     LegalizeResult Status = conversionLibcall(
784         MI, MIRBuilder,
785         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
786         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
787     if (Status != Legalized)
788       return Status;
789     break;
790   }
791   case TargetOpcode::G_SITOFP:
792   case TargetOpcode::G_UITOFP: {
793     // FIXME: Support other types
794     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
795     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
796     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
797       return UnableToLegalize;
798     LegalizeResult Status = conversionLibcall(
799         MI, MIRBuilder,
800         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
801         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
802     if (Status != Legalized)
803       return Status;
804     break;
805   }
806   case TargetOpcode::G_BZERO:
807   case TargetOpcode::G_MEMCPY:
808   case TargetOpcode::G_MEMMOVE:
809   case TargetOpcode::G_MEMSET: {
810     LegalizeResult Result =
811         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
812     if (Result != Legalized)
813       return Result;
814     MI.eraseFromParent();
815     return Result;
816   }
817   }
818 
819   MI.eraseFromParent();
820   return Legalized;
821 }
822 
823 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
824                                                               unsigned TypeIdx,
825                                                               LLT NarrowTy) {
826   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
827   uint64_t NarrowSize = NarrowTy.getSizeInBits();
828 
829   switch (MI.getOpcode()) {
830   default:
831     return UnableToLegalize;
832   case TargetOpcode::G_IMPLICIT_DEF: {
833     Register DstReg = MI.getOperand(0).getReg();
834     LLT DstTy = MRI.getType(DstReg);
835 
836     // If SizeOp0 is not an exact multiple of NarrowSize, emit
837     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
838     // FIXME: Although this would also be legal for the general case, it causes
839     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
840     //  combines not being hit). This seems to be a problem related to the
841     //  artifact combiner.
842     if (SizeOp0 % NarrowSize != 0) {
843       LLT ImplicitTy = NarrowTy;
844       if (DstTy.isVector())
845         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
846 
847       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
848       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
849 
850       MI.eraseFromParent();
851       return Legalized;
852     }
853 
854     int NumParts = SizeOp0 / NarrowSize;
855 
856     SmallVector<Register, 2> DstRegs;
857     for (int i = 0; i < NumParts; ++i)
858       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
859 
860     if (DstTy.isVector())
861       MIRBuilder.buildBuildVector(DstReg, DstRegs);
862     else
863       MIRBuilder.buildMerge(DstReg, DstRegs);
864     MI.eraseFromParent();
865     return Legalized;
866   }
867   case TargetOpcode::G_CONSTANT: {
868     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
869     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
870     unsigned TotalSize = Ty.getSizeInBits();
871     unsigned NarrowSize = NarrowTy.getSizeInBits();
872     int NumParts = TotalSize / NarrowSize;
873 
874     SmallVector<Register, 4> PartRegs;
875     for (int I = 0; I != NumParts; ++I) {
876       unsigned Offset = I * NarrowSize;
877       auto K = MIRBuilder.buildConstant(NarrowTy,
878                                         Val.lshr(Offset).trunc(NarrowSize));
879       PartRegs.push_back(K.getReg(0));
880     }
881 
882     LLT LeftoverTy;
883     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
884     SmallVector<Register, 1> LeftoverRegs;
885     if (LeftoverBits != 0) {
886       LeftoverTy = LLT::scalar(LeftoverBits);
887       auto K = MIRBuilder.buildConstant(
888         LeftoverTy,
889         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
890       LeftoverRegs.push_back(K.getReg(0));
891     }
892 
893     insertParts(MI.getOperand(0).getReg(),
894                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
895 
896     MI.eraseFromParent();
897     return Legalized;
898   }
899   case TargetOpcode::G_SEXT:
900   case TargetOpcode::G_ZEXT:
901   case TargetOpcode::G_ANYEXT:
902     return narrowScalarExt(MI, TypeIdx, NarrowTy);
903   case TargetOpcode::G_TRUNC: {
904     if (TypeIdx != 1)
905       return UnableToLegalize;
906 
907     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
908     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
909       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
910       return UnableToLegalize;
911     }
912 
913     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
914     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
915     MI.eraseFromParent();
916     return Legalized;
917   }
918 
919   case TargetOpcode::G_FREEZE:
920     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
921   case TargetOpcode::G_ADD:
922   case TargetOpcode::G_SUB:
923   case TargetOpcode::G_SADDO:
924   case TargetOpcode::G_SSUBO:
925   case TargetOpcode::G_SADDE:
926   case TargetOpcode::G_SSUBE:
927   case TargetOpcode::G_UADDO:
928   case TargetOpcode::G_USUBO:
929   case TargetOpcode::G_UADDE:
930   case TargetOpcode::G_USUBE:
931     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
932   case TargetOpcode::G_MUL:
933   case TargetOpcode::G_UMULH:
934     return narrowScalarMul(MI, NarrowTy);
935   case TargetOpcode::G_EXTRACT:
936     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
937   case TargetOpcode::G_INSERT:
938     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
939   case TargetOpcode::G_LOAD: {
940     auto &LoadMI = cast<GLoad>(MI);
941     Register DstReg = LoadMI.getDstReg();
942     LLT DstTy = MRI.getType(DstReg);
943     if (DstTy.isVector())
944       return UnableToLegalize;
945 
946     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
947       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
948       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
949       MIRBuilder.buildAnyExt(DstReg, TmpReg);
950       LoadMI.eraseFromParent();
951       return Legalized;
952     }
953 
954     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
955   }
956   case TargetOpcode::G_ZEXTLOAD:
957   case TargetOpcode::G_SEXTLOAD: {
958     auto &LoadMI = cast<GExtLoad>(MI);
959     Register DstReg = LoadMI.getDstReg();
960     Register PtrReg = LoadMI.getPointerReg();
961 
962     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
963     auto &MMO = LoadMI.getMMO();
964     unsigned MemSize = MMO.getSizeInBits();
965 
966     if (MemSize == NarrowSize) {
967       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
968     } else if (MemSize < NarrowSize) {
969       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
970     } else if (MemSize > NarrowSize) {
971       // FIXME: Need to split the load.
972       return UnableToLegalize;
973     }
974 
975     if (isa<GZExtLoad>(LoadMI))
976       MIRBuilder.buildZExt(DstReg, TmpReg);
977     else
978       MIRBuilder.buildSExt(DstReg, TmpReg);
979 
980     LoadMI.eraseFromParent();
981     return Legalized;
982   }
983   case TargetOpcode::G_STORE: {
984     auto &StoreMI = cast<GStore>(MI);
985 
986     Register SrcReg = StoreMI.getValueReg();
987     LLT SrcTy = MRI.getType(SrcReg);
988     if (SrcTy.isVector())
989       return UnableToLegalize;
990 
991     int NumParts = SizeOp0 / NarrowSize;
992     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
993     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
994     if (SrcTy.isVector() && LeftoverBits != 0)
995       return UnableToLegalize;
996 
997     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
998       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
999       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1000       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1001       StoreMI.eraseFromParent();
1002       return Legalized;
1003     }
1004 
1005     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1006   }
1007   case TargetOpcode::G_SELECT:
1008     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1009   case TargetOpcode::G_AND:
1010   case TargetOpcode::G_OR:
1011   case TargetOpcode::G_XOR: {
1012     // Legalize bitwise operation:
1013     // A = BinOp<Ty> B, C
1014     // into:
1015     // B1, ..., BN = G_UNMERGE_VALUES B
1016     // C1, ..., CN = G_UNMERGE_VALUES C
1017     // A1 = BinOp<Ty/N> B1, C2
1018     // ...
1019     // AN = BinOp<Ty/N> BN, CN
1020     // A = G_MERGE_VALUES A1, ..., AN
1021     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1022   }
1023   case TargetOpcode::G_SHL:
1024   case TargetOpcode::G_LSHR:
1025   case TargetOpcode::G_ASHR:
1026     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1027   case TargetOpcode::G_CTLZ:
1028   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1029   case TargetOpcode::G_CTTZ:
1030   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1031   case TargetOpcode::G_CTPOP:
1032     if (TypeIdx == 1)
1033       switch (MI.getOpcode()) {
1034       case TargetOpcode::G_CTLZ:
1035       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1036         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1037       case TargetOpcode::G_CTTZ:
1038       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1039         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1040       case TargetOpcode::G_CTPOP:
1041         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1042       default:
1043         return UnableToLegalize;
1044       }
1045 
1046     Observer.changingInstr(MI);
1047     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1048     Observer.changedInstr(MI);
1049     return Legalized;
1050   case TargetOpcode::G_INTTOPTR:
1051     if (TypeIdx != 1)
1052       return UnableToLegalize;
1053 
1054     Observer.changingInstr(MI);
1055     narrowScalarSrc(MI, NarrowTy, 1);
1056     Observer.changedInstr(MI);
1057     return Legalized;
1058   case TargetOpcode::G_PTRTOINT:
1059     if (TypeIdx != 0)
1060       return UnableToLegalize;
1061 
1062     Observer.changingInstr(MI);
1063     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1064     Observer.changedInstr(MI);
1065     return Legalized;
1066   case TargetOpcode::G_PHI: {
1067     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1068     // NarrowSize.
1069     if (SizeOp0 % NarrowSize != 0)
1070       return UnableToLegalize;
1071 
1072     unsigned NumParts = SizeOp0 / NarrowSize;
1073     SmallVector<Register, 2> DstRegs(NumParts);
1074     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1075     Observer.changingInstr(MI);
1076     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1077       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1078       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
1079       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1080                    SrcRegs[i / 2]);
1081     }
1082     MachineBasicBlock &MBB = *MI.getParent();
1083     MIRBuilder.setInsertPt(MBB, MI);
1084     for (unsigned i = 0; i < NumParts; ++i) {
1085       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1086       MachineInstrBuilder MIB =
1087           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1088       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1089         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1090     }
1091     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1092     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1093     Observer.changedInstr(MI);
1094     MI.eraseFromParent();
1095     return Legalized;
1096   }
1097   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1098   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1099     if (TypeIdx != 2)
1100       return UnableToLegalize;
1101 
1102     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1103     Observer.changingInstr(MI);
1104     narrowScalarSrc(MI, NarrowTy, OpIdx);
1105     Observer.changedInstr(MI);
1106     return Legalized;
1107   }
1108   case TargetOpcode::G_ICMP: {
1109     Register LHS = MI.getOperand(2).getReg();
1110     LLT SrcTy = MRI.getType(LHS);
1111     uint64_t SrcSize = SrcTy.getSizeInBits();
1112     CmpInst::Predicate Pred =
1113         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1114 
1115     // TODO: Handle the non-equality case for weird sizes.
1116     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1117       return UnableToLegalize;
1118 
1119     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1120     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1121     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1122                       LHSLeftoverRegs))
1123       return UnableToLegalize;
1124 
1125     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1126     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1127     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1128                       RHSPartRegs, RHSLeftoverRegs))
1129       return UnableToLegalize;
1130 
1131     // We now have the LHS and RHS of the compare split into narrow-type
1132     // registers, plus potentially some leftover type.
1133     Register Dst = MI.getOperand(0).getReg();
1134     LLT ResTy = MRI.getType(Dst);
1135     if (ICmpInst::isEquality(Pred)) {
1136       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1137       // them together. For each equal part, the result should be all 0s. For
1138       // each non-equal part, we'll get at least one 1.
1139       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1140       SmallVector<Register, 4> Xors;
1141       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1142         auto LHS = std::get<0>(LHSAndRHS);
1143         auto RHS = std::get<1>(LHSAndRHS);
1144         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1145         Xors.push_back(Xor);
1146       }
1147 
1148       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1149       // to the desired narrow type so that we can OR them together later.
1150       SmallVector<Register, 4> WidenedXors;
1151       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1152         auto LHS = std::get<0>(LHSAndRHS);
1153         auto RHS = std::get<1>(LHSAndRHS);
1154         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1155         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1156         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1157                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1158         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1159       }
1160 
1161       // Now, for each part we broke up, we know if they are equal/not equal
1162       // based off the G_XOR. We can OR these all together and compare against
1163       // 0 to get the result.
1164       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1165       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1166       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1167         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1168       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1169     } else {
1170       // TODO: Handle non-power-of-two types.
1171       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1172       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1173       Register LHSL = LHSPartRegs[0];
1174       Register LHSH = LHSPartRegs[1];
1175       Register RHSL = RHSPartRegs[0];
1176       Register RHSH = RHSPartRegs[1];
1177       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1178       MachineInstrBuilder CmpHEQ =
1179           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1180       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1181           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1182       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1183     }
1184     MI.eraseFromParent();
1185     return Legalized;
1186   }
1187   case TargetOpcode::G_SEXT_INREG: {
1188     if (TypeIdx != 0)
1189       return UnableToLegalize;
1190 
1191     int64_t SizeInBits = MI.getOperand(2).getImm();
1192 
1193     // So long as the new type has more bits than the bits we're extending we
1194     // don't need to break it apart.
1195     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1196       Observer.changingInstr(MI);
1197       // We don't lose any non-extension bits by truncating the src and
1198       // sign-extending the dst.
1199       MachineOperand &MO1 = MI.getOperand(1);
1200       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1201       MO1.setReg(TruncMIB.getReg(0));
1202 
1203       MachineOperand &MO2 = MI.getOperand(0);
1204       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1205       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1206       MIRBuilder.buildSExt(MO2, DstExt);
1207       MO2.setReg(DstExt);
1208       Observer.changedInstr(MI);
1209       return Legalized;
1210     }
1211 
1212     // Break it apart. Components below the extension point are unmodified. The
1213     // component containing the extension point becomes a narrower SEXT_INREG.
1214     // Components above it are ashr'd from the component containing the
1215     // extension point.
1216     if (SizeOp0 % NarrowSize != 0)
1217       return UnableToLegalize;
1218     int NumParts = SizeOp0 / NarrowSize;
1219 
1220     // List the registers where the destination will be scattered.
1221     SmallVector<Register, 2> DstRegs;
1222     // List the registers where the source will be split.
1223     SmallVector<Register, 2> SrcRegs;
1224 
1225     // Create all the temporary registers.
1226     for (int i = 0; i < NumParts; ++i) {
1227       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1228 
1229       SrcRegs.push_back(SrcReg);
1230     }
1231 
1232     // Explode the big arguments into smaller chunks.
1233     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1234 
1235     Register AshrCstReg =
1236         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1237             .getReg(0);
1238     Register FullExtensionReg = 0;
1239     Register PartialExtensionReg = 0;
1240 
1241     // Do the operation on each small part.
1242     for (int i = 0; i < NumParts; ++i) {
1243       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1244         DstRegs.push_back(SrcRegs[i]);
1245       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1246         assert(PartialExtensionReg &&
1247                "Expected to visit partial extension before full");
1248         if (FullExtensionReg) {
1249           DstRegs.push_back(FullExtensionReg);
1250           continue;
1251         }
1252         DstRegs.push_back(
1253             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1254                 .getReg(0));
1255         FullExtensionReg = DstRegs.back();
1256       } else {
1257         DstRegs.push_back(
1258             MIRBuilder
1259                 .buildInstr(
1260                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1261                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1262                 .getReg(0));
1263         PartialExtensionReg = DstRegs.back();
1264       }
1265     }
1266 
1267     // Gather the destination registers into the final destination.
1268     Register DstReg = MI.getOperand(0).getReg();
1269     MIRBuilder.buildMerge(DstReg, DstRegs);
1270     MI.eraseFromParent();
1271     return Legalized;
1272   }
1273   case TargetOpcode::G_BSWAP:
1274   case TargetOpcode::G_BITREVERSE: {
1275     if (SizeOp0 % NarrowSize != 0)
1276       return UnableToLegalize;
1277 
1278     Observer.changingInstr(MI);
1279     SmallVector<Register, 2> SrcRegs, DstRegs;
1280     unsigned NumParts = SizeOp0 / NarrowSize;
1281     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1282 
1283     for (unsigned i = 0; i < NumParts; ++i) {
1284       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1285                                            {SrcRegs[NumParts - 1 - i]});
1286       DstRegs.push_back(DstPart.getReg(0));
1287     }
1288 
1289     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1290 
1291     Observer.changedInstr(MI);
1292     MI.eraseFromParent();
1293     return Legalized;
1294   }
1295   case TargetOpcode::G_PTR_ADD:
1296   case TargetOpcode::G_PTRMASK: {
1297     if (TypeIdx != 1)
1298       return UnableToLegalize;
1299     Observer.changingInstr(MI);
1300     narrowScalarSrc(MI, NarrowTy, 2);
1301     Observer.changedInstr(MI);
1302     return Legalized;
1303   }
1304   case TargetOpcode::G_FPTOUI:
1305   case TargetOpcode::G_FPTOSI:
1306     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1307   case TargetOpcode::G_FPEXT:
1308     if (TypeIdx != 0)
1309       return UnableToLegalize;
1310     Observer.changingInstr(MI);
1311     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1312     Observer.changedInstr(MI);
1313     return Legalized;
1314   }
1315 }
1316 
1317 Register LegalizerHelper::coerceToScalar(Register Val) {
1318   LLT Ty = MRI.getType(Val);
1319   if (Ty.isScalar())
1320     return Val;
1321 
1322   const DataLayout &DL = MIRBuilder.getDataLayout();
1323   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1324   if (Ty.isPointer()) {
1325     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1326       return Register();
1327     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1328   }
1329 
1330   Register NewVal = Val;
1331 
1332   assert(Ty.isVector());
1333   LLT EltTy = Ty.getElementType();
1334   if (EltTy.isPointer())
1335     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1336   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1337 }
1338 
1339 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1340                                      unsigned OpIdx, unsigned ExtOpcode) {
1341   MachineOperand &MO = MI.getOperand(OpIdx);
1342   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1343   MO.setReg(ExtB.getReg(0));
1344 }
1345 
1346 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1347                                       unsigned OpIdx) {
1348   MachineOperand &MO = MI.getOperand(OpIdx);
1349   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1350   MO.setReg(ExtB.getReg(0));
1351 }
1352 
1353 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1354                                      unsigned OpIdx, unsigned TruncOpcode) {
1355   MachineOperand &MO = MI.getOperand(OpIdx);
1356   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1357   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1358   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1359   MO.setReg(DstExt);
1360 }
1361 
1362 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1363                                       unsigned OpIdx, unsigned ExtOpcode) {
1364   MachineOperand &MO = MI.getOperand(OpIdx);
1365   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1366   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1367   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1368   MO.setReg(DstTrunc);
1369 }
1370 
1371 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1372                                             unsigned OpIdx) {
1373   MachineOperand &MO = MI.getOperand(OpIdx);
1374   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1375   MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
1376 }
1377 
1378 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1379                                             unsigned OpIdx) {
1380   MachineOperand &MO = MI.getOperand(OpIdx);
1381 
1382   LLT OldTy = MRI.getType(MO.getReg());
1383   unsigned OldElts = OldTy.getNumElements();
1384   unsigned NewElts = MoreTy.getNumElements();
1385 
1386   unsigned NumParts = NewElts / OldElts;
1387 
1388   // Use concat_vectors if the result is a multiple of the number of elements.
1389   if (NumParts * OldElts == NewElts) {
1390     SmallVector<Register, 8> Parts;
1391     Parts.push_back(MO.getReg());
1392 
1393     Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
1394     for (unsigned I = 1; I != NumParts; ++I)
1395       Parts.push_back(ImpDef);
1396 
1397     auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
1398     MO.setReg(Concat.getReg(0));
1399     return;
1400   }
1401 
1402   Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
1403   Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
1404   MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
1405   MO.setReg(MoreReg);
1406 }
1407 
1408 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1409   MachineOperand &Op = MI.getOperand(OpIdx);
1410   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1411 }
1412 
1413 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1414   MachineOperand &MO = MI.getOperand(OpIdx);
1415   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1416   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1417   MIRBuilder.buildBitcast(MO, CastDst);
1418   MO.setReg(CastDst);
1419 }
1420 
1421 LegalizerHelper::LegalizeResult
1422 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1423                                         LLT WideTy) {
1424   if (TypeIdx != 1)
1425     return UnableToLegalize;
1426 
1427   Register DstReg = MI.getOperand(0).getReg();
1428   LLT DstTy = MRI.getType(DstReg);
1429   if (DstTy.isVector())
1430     return UnableToLegalize;
1431 
1432   Register Src1 = MI.getOperand(1).getReg();
1433   LLT SrcTy = MRI.getType(Src1);
1434   const int DstSize = DstTy.getSizeInBits();
1435   const int SrcSize = SrcTy.getSizeInBits();
1436   const int WideSize = WideTy.getSizeInBits();
1437   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1438 
1439   unsigned NumOps = MI.getNumOperands();
1440   unsigned NumSrc = MI.getNumOperands() - 1;
1441   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1442 
1443   if (WideSize >= DstSize) {
1444     // Directly pack the bits in the target type.
1445     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1446 
1447     for (unsigned I = 2; I != NumOps; ++I) {
1448       const unsigned Offset = (I - 1) * PartSize;
1449 
1450       Register SrcReg = MI.getOperand(I).getReg();
1451       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1452 
1453       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1454 
1455       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1456         MRI.createGenericVirtualRegister(WideTy);
1457 
1458       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1459       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1460       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1461       ResultReg = NextResult;
1462     }
1463 
1464     if (WideSize > DstSize)
1465       MIRBuilder.buildTrunc(DstReg, ResultReg);
1466     else if (DstTy.isPointer())
1467       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1468 
1469     MI.eraseFromParent();
1470     return Legalized;
1471   }
1472 
1473   // Unmerge the original values to the GCD type, and recombine to the next
1474   // multiple greater than the original type.
1475   //
1476   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1477   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1478   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1479   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1480   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1481   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1482   // %12:_(s12) = G_MERGE_VALUES %10, %11
1483   //
1484   // Padding with undef if necessary:
1485   //
1486   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1487   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1488   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1489   // %7:_(s2) = G_IMPLICIT_DEF
1490   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1491   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1492   // %10:_(s12) = G_MERGE_VALUES %8, %9
1493 
1494   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1495   LLT GCDTy = LLT::scalar(GCD);
1496 
1497   SmallVector<Register, 8> Parts;
1498   SmallVector<Register, 8> NewMergeRegs;
1499   SmallVector<Register, 8> Unmerges;
1500   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1501 
1502   // Decompose the original operands if they don't evenly divide.
1503   for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
1504     Register SrcReg = MI.getOperand(I).getReg();
1505     if (GCD == SrcSize) {
1506       Unmerges.push_back(SrcReg);
1507     } else {
1508       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1509       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1510         Unmerges.push_back(Unmerge.getReg(J));
1511     }
1512   }
1513 
1514   // Pad with undef to the next size that is a multiple of the requested size.
1515   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1516     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1517     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1518       Unmerges.push_back(UndefReg);
1519   }
1520 
1521   const int PartsPerGCD = WideSize / GCD;
1522 
1523   // Build merges of each piece.
1524   ArrayRef<Register> Slicer(Unmerges);
1525   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1526     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1527     NewMergeRegs.push_back(Merge.getReg(0));
1528   }
1529 
1530   // A truncate may be necessary if the requested type doesn't evenly divide the
1531   // original result type.
1532   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1533     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1534   } else {
1535     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1536     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1537   }
1538 
1539   MI.eraseFromParent();
1540   return Legalized;
1541 }
1542 
1543 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1544   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1545   LLT OrigTy = MRI.getType(OrigReg);
1546   LLT LCMTy = getLCMType(WideTy, OrigTy);
1547 
1548   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1549   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1550 
1551   Register UnmergeSrc = WideReg;
1552 
1553   // Create a merge to the LCM type, padding with undef
1554   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1555   // =>
1556   // %1:_(<4 x s32>) = G_FOO
1557   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1558   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1559   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1560   if (NumMergeParts > 1) {
1561     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1562     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1563     MergeParts[0] = WideReg;
1564     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1565   }
1566 
1567   // Unmerge to the original register and pad with dead defs.
1568   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1569   UnmergeResults[0] = OrigReg;
1570   for (int I = 1; I != NumUnmergeParts; ++I)
1571     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1572 
1573   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1574   return WideReg;
1575 }
1576 
1577 LegalizerHelper::LegalizeResult
1578 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1579                                           LLT WideTy) {
1580   if (TypeIdx != 0)
1581     return UnableToLegalize;
1582 
1583   int NumDst = MI.getNumOperands() - 1;
1584   Register SrcReg = MI.getOperand(NumDst).getReg();
1585   LLT SrcTy = MRI.getType(SrcReg);
1586   if (SrcTy.isVector())
1587     return UnableToLegalize;
1588 
1589   Register Dst0Reg = MI.getOperand(0).getReg();
1590   LLT DstTy = MRI.getType(Dst0Reg);
1591   if (!DstTy.isScalar())
1592     return UnableToLegalize;
1593 
1594   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1595     if (SrcTy.isPointer()) {
1596       const DataLayout &DL = MIRBuilder.getDataLayout();
1597       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1598         LLVM_DEBUG(
1599             dbgs() << "Not casting non-integral address space integer\n");
1600         return UnableToLegalize;
1601       }
1602 
1603       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1604       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1605     }
1606 
1607     // Widen SrcTy to WideTy. This does not affect the result, but since the
1608     // user requested this size, it is probably better handled than SrcTy and
1609     // should reduce the total number of legalization artifacts
1610     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1611       SrcTy = WideTy;
1612       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1613     }
1614 
1615     // Theres no unmerge type to target. Directly extract the bits from the
1616     // source type
1617     unsigned DstSize = DstTy.getSizeInBits();
1618 
1619     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1620     for (int I = 1; I != NumDst; ++I) {
1621       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1622       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1623       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1624     }
1625 
1626     MI.eraseFromParent();
1627     return Legalized;
1628   }
1629 
1630   // Extend the source to a wider type.
1631   LLT LCMTy = getLCMType(SrcTy, WideTy);
1632 
1633   Register WideSrc = SrcReg;
1634   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1635     // TODO: If this is an integral address space, cast to integer and anyext.
1636     if (SrcTy.isPointer()) {
1637       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1638       return UnableToLegalize;
1639     }
1640 
1641     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1642   }
1643 
1644   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1645 
1646   // Create a sequence of unmerges and merges to the original results. Since we
1647   // may have widened the source, we will need to pad the results with dead defs
1648   // to cover the source register.
1649   // e.g. widen s48 to s64:
1650   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1651   //
1652   // =>
1653   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1654   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1655   //  ; unpack to GCD type, with extra dead defs
1656   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1657   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1658   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1659   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1660   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1661   const LLT GCDTy = getGCDType(WideTy, DstTy);
1662   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1663   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1664 
1665   // Directly unmerge to the destination without going through a GCD type
1666   // if possible
1667   if (PartsPerRemerge == 1) {
1668     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1669 
1670     for (int I = 0; I != NumUnmerge; ++I) {
1671       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1672 
1673       for (int J = 0; J != PartsPerUnmerge; ++J) {
1674         int Idx = I * PartsPerUnmerge + J;
1675         if (Idx < NumDst)
1676           MIB.addDef(MI.getOperand(Idx).getReg());
1677         else {
1678           // Create dead def for excess components.
1679           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1680         }
1681       }
1682 
1683       MIB.addUse(Unmerge.getReg(I));
1684     }
1685   } else {
1686     SmallVector<Register, 16> Parts;
1687     for (int J = 0; J != NumUnmerge; ++J)
1688       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1689 
1690     SmallVector<Register, 8> RemergeParts;
1691     for (int I = 0; I != NumDst; ++I) {
1692       for (int J = 0; J < PartsPerRemerge; ++J) {
1693         const int Idx = I * PartsPerRemerge + J;
1694         RemergeParts.emplace_back(Parts[Idx]);
1695       }
1696 
1697       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1698       RemergeParts.clear();
1699     }
1700   }
1701 
1702   MI.eraseFromParent();
1703   return Legalized;
1704 }
1705 
1706 LegalizerHelper::LegalizeResult
1707 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1708                                     LLT WideTy) {
1709   Register DstReg = MI.getOperand(0).getReg();
1710   Register SrcReg = MI.getOperand(1).getReg();
1711   LLT SrcTy = MRI.getType(SrcReg);
1712 
1713   LLT DstTy = MRI.getType(DstReg);
1714   unsigned Offset = MI.getOperand(2).getImm();
1715 
1716   if (TypeIdx == 0) {
1717     if (SrcTy.isVector() || DstTy.isVector())
1718       return UnableToLegalize;
1719 
1720     SrcOp Src(SrcReg);
1721     if (SrcTy.isPointer()) {
1722       // Extracts from pointers can be handled only if they are really just
1723       // simple integers.
1724       const DataLayout &DL = MIRBuilder.getDataLayout();
1725       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1726         return UnableToLegalize;
1727 
1728       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1729       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1730       SrcTy = SrcAsIntTy;
1731     }
1732 
1733     if (DstTy.isPointer())
1734       return UnableToLegalize;
1735 
1736     if (Offset == 0) {
1737       // Avoid a shift in the degenerate case.
1738       MIRBuilder.buildTrunc(DstReg,
1739                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1740       MI.eraseFromParent();
1741       return Legalized;
1742     }
1743 
1744     // Do a shift in the source type.
1745     LLT ShiftTy = SrcTy;
1746     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1747       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1748       ShiftTy = WideTy;
1749     }
1750 
1751     auto LShr = MIRBuilder.buildLShr(
1752       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1753     MIRBuilder.buildTrunc(DstReg, LShr);
1754     MI.eraseFromParent();
1755     return Legalized;
1756   }
1757 
1758   if (SrcTy.isScalar()) {
1759     Observer.changingInstr(MI);
1760     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1761     Observer.changedInstr(MI);
1762     return Legalized;
1763   }
1764 
1765   if (!SrcTy.isVector())
1766     return UnableToLegalize;
1767 
1768   if (DstTy != SrcTy.getElementType())
1769     return UnableToLegalize;
1770 
1771   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1772     return UnableToLegalize;
1773 
1774   Observer.changingInstr(MI);
1775   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1776 
1777   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1778                           Offset);
1779   widenScalarDst(MI, WideTy.getScalarType(), 0);
1780   Observer.changedInstr(MI);
1781   return Legalized;
1782 }
1783 
1784 LegalizerHelper::LegalizeResult
1785 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1786                                    LLT WideTy) {
1787   if (TypeIdx != 0 || WideTy.isVector())
1788     return UnableToLegalize;
1789   Observer.changingInstr(MI);
1790   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1791   widenScalarDst(MI, WideTy);
1792   Observer.changedInstr(MI);
1793   return Legalized;
1794 }
1795 
1796 LegalizerHelper::LegalizeResult
1797 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1798                                            LLT WideTy) {
1799   if (TypeIdx == 1)
1800     return UnableToLegalize; // TODO
1801 
1802   unsigned Opcode;
1803   unsigned ExtOpcode;
1804   Optional<Register> CarryIn = None;
1805   switch (MI.getOpcode()) {
1806   default:
1807     llvm_unreachable("Unexpected opcode!");
1808   case TargetOpcode::G_SADDO:
1809     Opcode = TargetOpcode::G_ADD;
1810     ExtOpcode = TargetOpcode::G_SEXT;
1811     break;
1812   case TargetOpcode::G_SSUBO:
1813     Opcode = TargetOpcode::G_SUB;
1814     ExtOpcode = TargetOpcode::G_SEXT;
1815     break;
1816   case TargetOpcode::G_UADDO:
1817     Opcode = TargetOpcode::G_ADD;
1818     ExtOpcode = TargetOpcode::G_ZEXT;
1819     break;
1820   case TargetOpcode::G_USUBO:
1821     Opcode = TargetOpcode::G_SUB;
1822     ExtOpcode = TargetOpcode::G_ZEXT;
1823     break;
1824   case TargetOpcode::G_SADDE:
1825     Opcode = TargetOpcode::G_UADDE;
1826     ExtOpcode = TargetOpcode::G_SEXT;
1827     CarryIn = MI.getOperand(4).getReg();
1828     break;
1829   case TargetOpcode::G_SSUBE:
1830     Opcode = TargetOpcode::G_USUBE;
1831     ExtOpcode = TargetOpcode::G_SEXT;
1832     CarryIn = MI.getOperand(4).getReg();
1833     break;
1834   case TargetOpcode::G_UADDE:
1835     Opcode = TargetOpcode::G_UADDE;
1836     ExtOpcode = TargetOpcode::G_ZEXT;
1837     CarryIn = MI.getOperand(4).getReg();
1838     break;
1839   case TargetOpcode::G_USUBE:
1840     Opcode = TargetOpcode::G_USUBE;
1841     ExtOpcode = TargetOpcode::G_ZEXT;
1842     CarryIn = MI.getOperand(4).getReg();
1843     break;
1844   }
1845 
1846   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1847   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1848   // Do the arithmetic in the larger type.
1849   Register NewOp;
1850   if (CarryIn) {
1851     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1852     NewOp = MIRBuilder
1853                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1854                             {LHSExt, RHSExt, *CarryIn})
1855                 .getReg(0);
1856   } else {
1857     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1858   }
1859   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1860   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1861   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1862   // There is no overflow if the ExtOp is the same as NewOp.
1863   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1864   // Now trunc the NewOp to the original result.
1865   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1866   MI.eraseFromParent();
1867   return Legalized;
1868 }
1869 
1870 LegalizerHelper::LegalizeResult
1871 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1872                                          LLT WideTy) {
1873   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1874                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1875                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1876   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1877                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1878   // We can convert this to:
1879   //   1. Any extend iN to iM
1880   //   2. SHL by M-N
1881   //   3. [US][ADD|SUB|SHL]SAT
1882   //   4. L/ASHR by M-N
1883   //
1884   // It may be more efficient to lower this to a min and a max operation in
1885   // the higher precision arithmetic if the promoted operation isn't legal,
1886   // but this decision is up to the target's lowering request.
1887   Register DstReg = MI.getOperand(0).getReg();
1888 
1889   unsigned NewBits = WideTy.getScalarSizeInBits();
1890   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1891 
1892   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1893   // must not left shift the RHS to preserve the shift amount.
1894   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1895   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1896                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1897   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1898   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1899   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1900 
1901   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1902                                         {ShiftL, ShiftR}, MI.getFlags());
1903 
1904   // Use a shift that will preserve the number of sign bits when the trunc is
1905   // folded away.
1906   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1907                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1908 
1909   MIRBuilder.buildTrunc(DstReg, Result);
1910   MI.eraseFromParent();
1911   return Legalized;
1912 }
1913 
1914 LegalizerHelper::LegalizeResult
1915 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1916                                  LLT WideTy) {
1917   if (TypeIdx == 1)
1918     return UnableToLegalize;
1919 
1920   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1921   Register Result = MI.getOperand(0).getReg();
1922   Register OriginalOverflow = MI.getOperand(1).getReg();
1923   Register LHS = MI.getOperand(2).getReg();
1924   Register RHS = MI.getOperand(3).getReg();
1925   LLT SrcTy = MRI.getType(LHS);
1926   LLT OverflowTy = MRI.getType(OriginalOverflow);
1927   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1928 
1929   // To determine if the result overflowed in the larger type, we extend the
1930   // input to the larger type, do the multiply (checking if it overflows),
1931   // then also check the high bits of the result to see if overflow happened
1932   // there.
1933   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1934   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1935   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1936 
1937   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1938                                     {LeftOperand, RightOperand});
1939   auto Mul = Mulo->getOperand(0);
1940   MIRBuilder.buildTrunc(Result, Mul);
1941 
1942   MachineInstrBuilder ExtResult;
1943   // Overflow occurred if it occurred in the larger type, or if the high part
1944   // of the result does not zero/sign-extend the low part.  Check this second
1945   // possibility first.
1946   if (IsSigned) {
1947     // For signed, overflow occurred when the high part does not sign-extend
1948     // the low part.
1949     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
1950   } else {
1951     // Unsigned overflow occurred when the high part does not zero-extend the
1952     // low part.
1953     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
1954   }
1955 
1956   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
1957   // so we don't need to check the overflow result of larger type Mulo.
1958   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
1959     auto Overflow =
1960         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
1961     // Finally check if the multiplication in the larger type itself overflowed.
1962     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
1963   } else {
1964     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
1965   }
1966   MI.eraseFromParent();
1967   return Legalized;
1968 }
1969 
1970 LegalizerHelper::LegalizeResult
1971 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1972   switch (MI.getOpcode()) {
1973   default:
1974     return UnableToLegalize;
1975   case TargetOpcode::G_ATOMICRMW_XCHG:
1976   case TargetOpcode::G_ATOMICRMW_ADD:
1977   case TargetOpcode::G_ATOMICRMW_SUB:
1978   case TargetOpcode::G_ATOMICRMW_AND:
1979   case TargetOpcode::G_ATOMICRMW_OR:
1980   case TargetOpcode::G_ATOMICRMW_XOR:
1981   case TargetOpcode::G_ATOMICRMW_MIN:
1982   case TargetOpcode::G_ATOMICRMW_MAX:
1983   case TargetOpcode::G_ATOMICRMW_UMIN:
1984   case TargetOpcode::G_ATOMICRMW_UMAX:
1985     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
1986     Observer.changingInstr(MI);
1987     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1988     widenScalarDst(MI, WideTy, 0);
1989     Observer.changedInstr(MI);
1990     return Legalized;
1991   case TargetOpcode::G_ATOMIC_CMPXCHG:
1992     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
1993     Observer.changingInstr(MI);
1994     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1995     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
1996     widenScalarDst(MI, WideTy, 0);
1997     Observer.changedInstr(MI);
1998     return Legalized;
1999   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2000     if (TypeIdx == 0) {
2001       Observer.changingInstr(MI);
2002       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2003       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2004       widenScalarDst(MI, WideTy, 0);
2005       Observer.changedInstr(MI);
2006       return Legalized;
2007     }
2008     assert(TypeIdx == 1 &&
2009            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2010     Observer.changingInstr(MI);
2011     widenScalarDst(MI, WideTy, 1);
2012     Observer.changedInstr(MI);
2013     return Legalized;
2014   case TargetOpcode::G_EXTRACT:
2015     return widenScalarExtract(MI, TypeIdx, WideTy);
2016   case TargetOpcode::G_INSERT:
2017     return widenScalarInsert(MI, TypeIdx, WideTy);
2018   case TargetOpcode::G_MERGE_VALUES:
2019     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2020   case TargetOpcode::G_UNMERGE_VALUES:
2021     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2022   case TargetOpcode::G_SADDO:
2023   case TargetOpcode::G_SSUBO:
2024   case TargetOpcode::G_UADDO:
2025   case TargetOpcode::G_USUBO:
2026   case TargetOpcode::G_SADDE:
2027   case TargetOpcode::G_SSUBE:
2028   case TargetOpcode::G_UADDE:
2029   case TargetOpcode::G_USUBE:
2030     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2031   case TargetOpcode::G_UMULO:
2032   case TargetOpcode::G_SMULO:
2033     return widenScalarMulo(MI, TypeIdx, WideTy);
2034   case TargetOpcode::G_SADDSAT:
2035   case TargetOpcode::G_SSUBSAT:
2036   case TargetOpcode::G_SSHLSAT:
2037   case TargetOpcode::G_UADDSAT:
2038   case TargetOpcode::G_USUBSAT:
2039   case TargetOpcode::G_USHLSAT:
2040     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2041   case TargetOpcode::G_CTTZ:
2042   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2043   case TargetOpcode::G_CTLZ:
2044   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2045   case TargetOpcode::G_CTPOP: {
2046     if (TypeIdx == 0) {
2047       Observer.changingInstr(MI);
2048       widenScalarDst(MI, WideTy, 0);
2049       Observer.changedInstr(MI);
2050       return Legalized;
2051     }
2052 
2053     Register SrcReg = MI.getOperand(1).getReg();
2054 
2055     // First extend the input.
2056     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2057                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2058                           ? TargetOpcode::G_ANYEXT
2059                           : TargetOpcode::G_ZEXT;
2060     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2061     LLT CurTy = MRI.getType(SrcReg);
2062     unsigned NewOpc = MI.getOpcode();
2063     if (NewOpc == TargetOpcode::G_CTTZ) {
2064       // The count is the same in the larger type except if the original
2065       // value was zero.  This can be handled by setting the bit just off
2066       // the top of the original type.
2067       auto TopBit =
2068           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2069       MIBSrc = MIRBuilder.buildOr(
2070         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2071       // Now we know the operand is non-zero, use the more relaxed opcode.
2072       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2073     }
2074 
2075     // Perform the operation at the larger size.
2076     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2077     // This is already the correct result for CTPOP and CTTZs
2078     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2079         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2080       // The correct result is NewOp - (Difference in widety and current ty).
2081       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2082       MIBNewOp = MIRBuilder.buildSub(
2083           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2084     }
2085 
2086     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2087     MI.eraseFromParent();
2088     return Legalized;
2089   }
2090   case TargetOpcode::G_BSWAP: {
2091     Observer.changingInstr(MI);
2092     Register DstReg = MI.getOperand(0).getReg();
2093 
2094     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2095     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2096     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2097     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2098 
2099     MI.getOperand(0).setReg(DstExt);
2100 
2101     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2102 
2103     LLT Ty = MRI.getType(DstReg);
2104     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2105     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2106     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2107 
2108     MIRBuilder.buildTrunc(DstReg, ShrReg);
2109     Observer.changedInstr(MI);
2110     return Legalized;
2111   }
2112   case TargetOpcode::G_BITREVERSE: {
2113     Observer.changingInstr(MI);
2114 
2115     Register DstReg = MI.getOperand(0).getReg();
2116     LLT Ty = MRI.getType(DstReg);
2117     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2118 
2119     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2121     MI.getOperand(0).setReg(DstExt);
2122     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2123 
2124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2125     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2126     MIRBuilder.buildTrunc(DstReg, Shift);
2127     Observer.changedInstr(MI);
2128     return Legalized;
2129   }
2130   case TargetOpcode::G_FREEZE:
2131     Observer.changingInstr(MI);
2132     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2133     widenScalarDst(MI, WideTy);
2134     Observer.changedInstr(MI);
2135     return Legalized;
2136 
2137   case TargetOpcode::G_ABS:
2138     Observer.changingInstr(MI);
2139     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2140     widenScalarDst(MI, WideTy);
2141     Observer.changedInstr(MI);
2142     return Legalized;
2143 
2144   case TargetOpcode::G_ADD:
2145   case TargetOpcode::G_AND:
2146   case TargetOpcode::G_MUL:
2147   case TargetOpcode::G_OR:
2148   case TargetOpcode::G_XOR:
2149   case TargetOpcode::G_SUB:
2150     // Perform operation at larger width (any extension is fines here, high bits
2151     // don't affect the result) and then truncate the result back to the
2152     // original type.
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2155     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2156     widenScalarDst(MI, WideTy);
2157     Observer.changedInstr(MI);
2158     return Legalized;
2159 
2160   case TargetOpcode::G_SBFX:
2161   case TargetOpcode::G_UBFX:
2162     Observer.changingInstr(MI);
2163 
2164     if (TypeIdx == 0) {
2165       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2166       widenScalarDst(MI, WideTy);
2167     } else {
2168       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2169       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2170     }
2171 
2172     Observer.changedInstr(MI);
2173     return Legalized;
2174 
2175   case TargetOpcode::G_SHL:
2176     Observer.changingInstr(MI);
2177 
2178     if (TypeIdx == 0) {
2179       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180       widenScalarDst(MI, WideTy);
2181     } else {
2182       assert(TypeIdx == 1);
2183       // The "number of bits to shift" operand must preserve its value as an
2184       // unsigned integer:
2185       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2186     }
2187 
2188     Observer.changedInstr(MI);
2189     return Legalized;
2190 
2191   case TargetOpcode::G_SDIV:
2192   case TargetOpcode::G_SREM:
2193   case TargetOpcode::G_SMIN:
2194   case TargetOpcode::G_SMAX:
2195     Observer.changingInstr(MI);
2196     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2197     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2198     widenScalarDst(MI, WideTy);
2199     Observer.changedInstr(MI);
2200     return Legalized;
2201 
2202   case TargetOpcode::G_SDIVREM:
2203     Observer.changingInstr(MI);
2204     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2205     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2206     widenScalarDst(MI, WideTy);
2207     widenScalarDst(MI, WideTy, 1);
2208     Observer.changedInstr(MI);
2209     return Legalized;
2210 
2211   case TargetOpcode::G_ASHR:
2212   case TargetOpcode::G_LSHR:
2213     Observer.changingInstr(MI);
2214 
2215     if (TypeIdx == 0) {
2216       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2217         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2218 
2219       widenScalarSrc(MI, WideTy, 1, CvtOp);
2220       widenScalarDst(MI, WideTy);
2221     } else {
2222       assert(TypeIdx == 1);
2223       // The "number of bits to shift" operand must preserve its value as an
2224       // unsigned integer:
2225       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2226     }
2227 
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230   case TargetOpcode::G_UDIV:
2231   case TargetOpcode::G_UREM:
2232   case TargetOpcode::G_UMIN:
2233   case TargetOpcode::G_UMAX:
2234     Observer.changingInstr(MI);
2235     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2236     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2237     widenScalarDst(MI, WideTy);
2238     Observer.changedInstr(MI);
2239     return Legalized;
2240 
2241   case TargetOpcode::G_UDIVREM:
2242     Observer.changingInstr(MI);
2243     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     widenScalarDst(MI, WideTy);
2246     widenScalarDst(MI, WideTy, 1);
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SELECT:
2251     Observer.changingInstr(MI);
2252     if (TypeIdx == 0) {
2253       // Perform operation at larger width (any extension is fine here, high
2254       // bits don't affect the result) and then truncate the result back to the
2255       // original type.
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2257       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2258       widenScalarDst(MI, WideTy);
2259     } else {
2260       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2261       // Explicit extension is required here since high bits affect the result.
2262       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2263     }
2264     Observer.changedInstr(MI);
2265     return Legalized;
2266 
2267   case TargetOpcode::G_FPTOSI:
2268   case TargetOpcode::G_FPTOUI:
2269     Observer.changingInstr(MI);
2270 
2271     if (TypeIdx == 0)
2272       widenScalarDst(MI, WideTy);
2273     else
2274       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2275 
2276     Observer.changedInstr(MI);
2277     return Legalized;
2278   case TargetOpcode::G_SITOFP:
2279     Observer.changingInstr(MI);
2280 
2281     if (TypeIdx == 0)
2282       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2283     else
2284       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2285 
2286     Observer.changedInstr(MI);
2287     return Legalized;
2288   case TargetOpcode::G_UITOFP:
2289     Observer.changingInstr(MI);
2290 
2291     if (TypeIdx == 0)
2292       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2293     else
2294       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295 
2296     Observer.changedInstr(MI);
2297     return Legalized;
2298   case TargetOpcode::G_LOAD:
2299   case TargetOpcode::G_SEXTLOAD:
2300   case TargetOpcode::G_ZEXTLOAD:
2301     Observer.changingInstr(MI);
2302     widenScalarDst(MI, WideTy);
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305 
2306   case TargetOpcode::G_STORE: {
2307     if (TypeIdx != 0)
2308       return UnableToLegalize;
2309 
2310     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2311     if (!Ty.isScalar())
2312       return UnableToLegalize;
2313 
2314     Observer.changingInstr(MI);
2315 
2316     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2317       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2318     widenScalarSrc(MI, WideTy, 0, ExtType);
2319 
2320     Observer.changedInstr(MI);
2321     return Legalized;
2322   }
2323   case TargetOpcode::G_CONSTANT: {
2324     MachineOperand &SrcMO = MI.getOperand(1);
2325     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2326     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2327         MRI.getType(MI.getOperand(0).getReg()));
2328     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2329             ExtOpc == TargetOpcode::G_ANYEXT) &&
2330            "Illegal Extend");
2331     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2332     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2333                            ? SrcVal.sext(WideTy.getSizeInBits())
2334                            : SrcVal.zext(WideTy.getSizeInBits());
2335     Observer.changingInstr(MI);
2336     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2337 
2338     widenScalarDst(MI, WideTy);
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341   }
2342   case TargetOpcode::G_FCONSTANT: {
2343     MachineOperand &SrcMO = MI.getOperand(1);
2344     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2345     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2346     bool LosesInfo;
2347     switch (WideTy.getSizeInBits()) {
2348     case 32:
2349       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2350                   &LosesInfo);
2351       break;
2352     case 64:
2353       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2354                   &LosesInfo);
2355       break;
2356     default:
2357       return UnableToLegalize;
2358     }
2359 
2360     assert(!LosesInfo && "extend should always be lossless");
2361 
2362     Observer.changingInstr(MI);
2363     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2364 
2365     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2366     Observer.changedInstr(MI);
2367     return Legalized;
2368   }
2369   case TargetOpcode::G_IMPLICIT_DEF: {
2370     Observer.changingInstr(MI);
2371     widenScalarDst(MI, WideTy);
2372     Observer.changedInstr(MI);
2373     return Legalized;
2374   }
2375   case TargetOpcode::G_BRCOND:
2376     Observer.changingInstr(MI);
2377     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_FCMP:
2382     Observer.changingInstr(MI);
2383     if (TypeIdx == 0)
2384       widenScalarDst(MI, WideTy);
2385     else {
2386       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2387       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2388     }
2389     Observer.changedInstr(MI);
2390     return Legalized;
2391 
2392   case TargetOpcode::G_ICMP:
2393     Observer.changingInstr(MI);
2394     if (TypeIdx == 0)
2395       widenScalarDst(MI, WideTy);
2396     else {
2397       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2398                                MI.getOperand(1).getPredicate()))
2399                                ? TargetOpcode::G_SEXT
2400                                : TargetOpcode::G_ZEXT;
2401       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2402       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2403     }
2404     Observer.changedInstr(MI);
2405     return Legalized;
2406 
2407   case TargetOpcode::G_PTR_ADD:
2408     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2409     Observer.changingInstr(MI);
2410     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2411     Observer.changedInstr(MI);
2412     return Legalized;
2413 
2414   case TargetOpcode::G_PHI: {
2415     assert(TypeIdx == 0 && "Expecting only Idx 0");
2416 
2417     Observer.changingInstr(MI);
2418     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2419       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2420       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2421       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2422     }
2423 
2424     MachineBasicBlock &MBB = *MI.getParent();
2425     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2426     widenScalarDst(MI, WideTy);
2427     Observer.changedInstr(MI);
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2431     if (TypeIdx == 0) {
2432       Register VecReg = MI.getOperand(1).getReg();
2433       LLT VecTy = MRI.getType(VecReg);
2434       Observer.changingInstr(MI);
2435 
2436       widenScalarSrc(
2437           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2438           TargetOpcode::G_SEXT);
2439 
2440       widenScalarDst(MI, WideTy, 0);
2441       Observer.changedInstr(MI);
2442       return Legalized;
2443     }
2444 
2445     if (TypeIdx != 2)
2446       return UnableToLegalize;
2447     Observer.changingInstr(MI);
2448     // TODO: Probably should be zext
2449     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2450     Observer.changedInstr(MI);
2451     return Legalized;
2452   }
2453   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2454     if (TypeIdx == 1) {
2455       Observer.changingInstr(MI);
2456 
2457       Register VecReg = MI.getOperand(1).getReg();
2458       LLT VecTy = MRI.getType(VecReg);
2459       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2460 
2461       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2462       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2463       widenScalarDst(MI, WideVecTy, 0);
2464       Observer.changedInstr(MI);
2465       return Legalized;
2466     }
2467 
2468     if (TypeIdx == 2) {
2469       Observer.changingInstr(MI);
2470       // TODO: Probably should be zext
2471       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2472       Observer.changedInstr(MI);
2473       return Legalized;
2474     }
2475 
2476     return UnableToLegalize;
2477   }
2478   case TargetOpcode::G_FADD:
2479   case TargetOpcode::G_FMUL:
2480   case TargetOpcode::G_FSUB:
2481   case TargetOpcode::G_FMA:
2482   case TargetOpcode::G_FMAD:
2483   case TargetOpcode::G_FNEG:
2484   case TargetOpcode::G_FABS:
2485   case TargetOpcode::G_FCANONICALIZE:
2486   case TargetOpcode::G_FMINNUM:
2487   case TargetOpcode::G_FMAXNUM:
2488   case TargetOpcode::G_FMINNUM_IEEE:
2489   case TargetOpcode::G_FMAXNUM_IEEE:
2490   case TargetOpcode::G_FMINIMUM:
2491   case TargetOpcode::G_FMAXIMUM:
2492   case TargetOpcode::G_FDIV:
2493   case TargetOpcode::G_FREM:
2494   case TargetOpcode::G_FCEIL:
2495   case TargetOpcode::G_FFLOOR:
2496   case TargetOpcode::G_FCOS:
2497   case TargetOpcode::G_FSIN:
2498   case TargetOpcode::G_FLOG10:
2499   case TargetOpcode::G_FLOG:
2500   case TargetOpcode::G_FLOG2:
2501   case TargetOpcode::G_FRINT:
2502   case TargetOpcode::G_FNEARBYINT:
2503   case TargetOpcode::G_FSQRT:
2504   case TargetOpcode::G_FEXP:
2505   case TargetOpcode::G_FEXP2:
2506   case TargetOpcode::G_FPOW:
2507   case TargetOpcode::G_INTRINSIC_TRUNC:
2508   case TargetOpcode::G_INTRINSIC_ROUND:
2509   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2510     assert(TypeIdx == 0);
2511     Observer.changingInstr(MI);
2512 
2513     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2514       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2515 
2516     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2517     Observer.changedInstr(MI);
2518     return Legalized;
2519   case TargetOpcode::G_FPOWI: {
2520     if (TypeIdx != 0)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2524     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INTTOPTR:
2529     if (TypeIdx != 1)
2530       return UnableToLegalize;
2531 
2532     Observer.changingInstr(MI);
2533     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2534     Observer.changedInstr(MI);
2535     return Legalized;
2536   case TargetOpcode::G_PTRTOINT:
2537     if (TypeIdx != 0)
2538       return UnableToLegalize;
2539 
2540     Observer.changingInstr(MI);
2541     widenScalarDst(MI, WideTy, 0);
2542     Observer.changedInstr(MI);
2543     return Legalized;
2544   case TargetOpcode::G_BUILD_VECTOR: {
2545     Observer.changingInstr(MI);
2546 
2547     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2548     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2549       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2550 
2551     // Avoid changing the result vector type if the source element type was
2552     // requested.
2553     if (TypeIdx == 1) {
2554       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2555     } else {
2556       widenScalarDst(MI, WideTy, 0);
2557     }
2558 
2559     Observer.changedInstr(MI);
2560     return Legalized;
2561   }
2562   case TargetOpcode::G_SEXT_INREG:
2563     if (TypeIdx != 0)
2564       return UnableToLegalize;
2565 
2566     Observer.changingInstr(MI);
2567     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2568     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2569     Observer.changedInstr(MI);
2570     return Legalized;
2571   case TargetOpcode::G_PTRMASK: {
2572     if (TypeIdx != 1)
2573       return UnableToLegalize;
2574     Observer.changingInstr(MI);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     Observer.changedInstr(MI);
2577     return Legalized;
2578   }
2579   }
2580 }
2581 
2582 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2583                              MachineIRBuilder &B, Register Src, LLT Ty) {
2584   auto Unmerge = B.buildUnmerge(Ty, Src);
2585   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2586     Pieces.push_back(Unmerge.getReg(I));
2587 }
2588 
2589 LegalizerHelper::LegalizeResult
2590 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2591   Register Dst = MI.getOperand(0).getReg();
2592   Register Src = MI.getOperand(1).getReg();
2593   LLT DstTy = MRI.getType(Dst);
2594   LLT SrcTy = MRI.getType(Src);
2595 
2596   if (SrcTy.isVector()) {
2597     LLT SrcEltTy = SrcTy.getElementType();
2598     SmallVector<Register, 8> SrcRegs;
2599 
2600     if (DstTy.isVector()) {
2601       int NumDstElt = DstTy.getNumElements();
2602       int NumSrcElt = SrcTy.getNumElements();
2603 
2604       LLT DstEltTy = DstTy.getElementType();
2605       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2606       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2607 
2608       // If there's an element size mismatch, insert intermediate casts to match
2609       // the result element type.
2610       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2611         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2612         //
2613         // =>
2614         //
2615         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2616         // %3:_(<2 x s8>) = G_BITCAST %2
2617         // %4:_(<2 x s8>) = G_BITCAST %3
2618         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
2619         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2620         SrcPartTy = SrcEltTy;
2621       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2622         //
2623         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2624         //
2625         // =>
2626         //
2627         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2628         // %3:_(s16) = G_BITCAST %2
2629         // %4:_(s16) = G_BITCAST %3
2630         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
2631         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2632         DstCastTy = DstEltTy;
2633       }
2634 
2635       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2636       for (Register &SrcReg : SrcRegs)
2637         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2638     } else
2639       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2640 
2641     MIRBuilder.buildMerge(Dst, SrcRegs);
2642     MI.eraseFromParent();
2643     return Legalized;
2644   }
2645 
2646   if (DstTy.isVector()) {
2647     SmallVector<Register, 8> SrcRegs;
2648     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2649     MIRBuilder.buildMerge(Dst, SrcRegs);
2650     MI.eraseFromParent();
2651     return Legalized;
2652   }
2653 
2654   return UnableToLegalize;
2655 }
2656 
2657 /// Figure out the bit offset into a register when coercing a vector index for
2658 /// the wide element type. This is only for the case when promoting vector to
2659 /// one with larger elements.
2660 //
2661 ///
2662 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2663 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2664 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2665                                                    Register Idx,
2666                                                    unsigned NewEltSize,
2667                                                    unsigned OldEltSize) {
2668   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2669   LLT IdxTy = B.getMRI()->getType(Idx);
2670 
2671   // Now figure out the amount we need to shift to get the target bits.
2672   auto OffsetMask = B.buildConstant(
2673       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2674   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2675   return B.buildShl(IdxTy, OffsetIdx,
2676                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2677 }
2678 
2679 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2680 /// is casting to a vector with a smaller element size, perform multiple element
2681 /// extracts and merge the results. If this is coercing to a vector with larger
2682 /// elements, index the bitcasted vector and extract the target element with bit
2683 /// operations. This is intended to force the indexing in the native register
2684 /// size for architectures that can dynamically index the register file.
2685 LegalizerHelper::LegalizeResult
2686 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2687                                          LLT CastTy) {
2688   if (TypeIdx != 1)
2689     return UnableToLegalize;
2690 
2691   Register Dst = MI.getOperand(0).getReg();
2692   Register SrcVec = MI.getOperand(1).getReg();
2693   Register Idx = MI.getOperand(2).getReg();
2694   LLT SrcVecTy = MRI.getType(SrcVec);
2695   LLT IdxTy = MRI.getType(Idx);
2696 
2697   LLT SrcEltTy = SrcVecTy.getElementType();
2698   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2699   unsigned OldNumElts = SrcVecTy.getNumElements();
2700 
2701   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2702   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2703 
2704   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2705   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2706   if (NewNumElts > OldNumElts) {
2707     // Decreasing the vector element size
2708     //
2709     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2710     //  =>
2711     //  v4i32:castx = bitcast x:v2i64
2712     //
2713     // i64 = bitcast
2714     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2715     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2716     //
2717     if (NewNumElts % OldNumElts != 0)
2718       return UnableToLegalize;
2719 
2720     // Type of the intermediate result vector.
2721     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2722     LLT MidTy =
2723         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2724 
2725     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2726 
2727     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2728     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2729 
2730     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2731       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2732       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2733       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2734       NewOps[I] = Elt.getReg(0);
2735     }
2736 
2737     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2738     MIRBuilder.buildBitcast(Dst, NewVec);
2739     MI.eraseFromParent();
2740     return Legalized;
2741   }
2742 
2743   if (NewNumElts < OldNumElts) {
2744     if (NewEltSize % OldEltSize != 0)
2745       return UnableToLegalize;
2746 
2747     // This only depends on powers of 2 because we use bit tricks to figure out
2748     // the bit offset we need to shift to get the target element. A general
2749     // expansion could emit division/multiply.
2750     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2751       return UnableToLegalize;
2752 
2753     // Increasing the vector element size.
2754     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2755     //
2756     //   =>
2757     //
2758     // %cast = G_BITCAST %vec
2759     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2760     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2761     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2762     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2763     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2764     // %elt = G_TRUNC %elt_bits
2765 
2766     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2767     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2768 
2769     // Divide to get the index in the wider element type.
2770     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2771 
2772     Register WideElt = CastVec;
2773     if (CastTy.isVector()) {
2774       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2775                                                      ScaledIdx).getReg(0);
2776     }
2777 
2778     // Compute the bit offset into the register of the target element.
2779     Register OffsetBits = getBitcastWiderVectorElementOffset(
2780       MIRBuilder, Idx, NewEltSize, OldEltSize);
2781 
2782     // Shift the wide element to get the target element.
2783     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2784     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2785     MI.eraseFromParent();
2786     return Legalized;
2787   }
2788 
2789   return UnableToLegalize;
2790 }
2791 
2792 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2793 /// TargetReg, while preserving other bits in \p TargetReg.
2794 ///
2795 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2796 static Register buildBitFieldInsert(MachineIRBuilder &B,
2797                                     Register TargetReg, Register InsertReg,
2798                                     Register OffsetBits) {
2799   LLT TargetTy = B.getMRI()->getType(TargetReg);
2800   LLT InsertTy = B.getMRI()->getType(InsertReg);
2801   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2802   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2803 
2804   // Produce a bitmask of the value to insert
2805   auto EltMask = B.buildConstant(
2806     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2807                                    InsertTy.getSizeInBits()));
2808   // Shift it into position
2809   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2810   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2811 
2812   // Clear out the bits in the wide element
2813   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2814 
2815   // The value to insert has all zeros already, so stick it into the masked
2816   // wide element.
2817   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2818 }
2819 
2820 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2821 /// is increasing the element size, perform the indexing in the target element
2822 /// type, and use bit operations to insert at the element position. This is
2823 /// intended for architectures that can dynamically index the register file and
2824 /// want to force indexing in the native register size.
2825 LegalizerHelper::LegalizeResult
2826 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2827                                         LLT CastTy) {
2828   if (TypeIdx != 0)
2829     return UnableToLegalize;
2830 
2831   Register Dst = MI.getOperand(0).getReg();
2832   Register SrcVec = MI.getOperand(1).getReg();
2833   Register Val = MI.getOperand(2).getReg();
2834   Register Idx = MI.getOperand(3).getReg();
2835 
2836   LLT VecTy = MRI.getType(Dst);
2837   LLT IdxTy = MRI.getType(Idx);
2838 
2839   LLT VecEltTy = VecTy.getElementType();
2840   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2841   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2842   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2843 
2844   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2845   unsigned OldNumElts = VecTy.getNumElements();
2846 
2847   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2848   if (NewNumElts < OldNumElts) {
2849     if (NewEltSize % OldEltSize != 0)
2850       return UnableToLegalize;
2851 
2852     // This only depends on powers of 2 because we use bit tricks to figure out
2853     // the bit offset we need to shift to get the target element. A general
2854     // expansion could emit division/multiply.
2855     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2856       return UnableToLegalize;
2857 
2858     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2859     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2860 
2861     // Divide to get the index in the wider element type.
2862     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2863 
2864     Register ExtractedElt = CastVec;
2865     if (CastTy.isVector()) {
2866       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2867                                                           ScaledIdx).getReg(0);
2868     }
2869 
2870     // Compute the bit offset into the register of the target element.
2871     Register OffsetBits = getBitcastWiderVectorElementOffset(
2872       MIRBuilder, Idx, NewEltSize, OldEltSize);
2873 
2874     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2875                                                Val, OffsetBits);
2876     if (CastTy.isVector()) {
2877       InsertedElt = MIRBuilder.buildInsertVectorElement(
2878         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2879     }
2880 
2881     MIRBuilder.buildBitcast(Dst, InsertedElt);
2882     MI.eraseFromParent();
2883     return Legalized;
2884   }
2885 
2886   return UnableToLegalize;
2887 }
2888 
2889 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2890   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2891   Register DstReg = LoadMI.getDstReg();
2892   Register PtrReg = LoadMI.getPointerReg();
2893   LLT DstTy = MRI.getType(DstReg);
2894   MachineMemOperand &MMO = LoadMI.getMMO();
2895   LLT MemTy = MMO.getMemoryType();
2896   MachineFunction &MF = MIRBuilder.getMF();
2897 
2898   unsigned MemSizeInBits = MemTy.getSizeInBits();
2899   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2900 
2901   if (MemSizeInBits != MemStoreSizeInBits) {
2902     if (MemTy.isVector())
2903       return UnableToLegalize;
2904 
2905     // Promote to a byte-sized load if not loading an integral number of
2906     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2907     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2908     MachineMemOperand *NewMMO =
2909         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2910 
2911     Register LoadReg = DstReg;
2912     LLT LoadTy = DstTy;
2913 
2914     // If this wasn't already an extending load, we need to widen the result
2915     // register to avoid creating a load with a narrower result than the source.
2916     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2917       LoadTy = WideMemTy;
2918       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2919     }
2920 
2921     if (isa<GSExtLoad>(LoadMI)) {
2922       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2923       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2924     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2925       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2926       // The extra bits are guaranteed to be zero, since we stored them that
2927       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2928       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
2929     } else {
2930       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
2931     }
2932 
2933     if (DstTy != LoadTy)
2934       MIRBuilder.buildTrunc(DstReg, LoadReg);
2935 
2936     LoadMI.eraseFromParent();
2937     return Legalized;
2938   }
2939 
2940   // Big endian lowering not implemented.
2941   if (MIRBuilder.getDataLayout().isBigEndian())
2942     return UnableToLegalize;
2943 
2944   // This load needs splitting into power of 2 sized loads.
2945   //
2946   // Our strategy here is to generate anyextending loads for the smaller
2947   // types up to next power-2 result type, and then combine the two larger
2948   // result values together, before truncating back down to the non-pow-2
2949   // type.
2950   // E.g. v1 = i24 load =>
2951   // v2 = i32 zextload (2 byte)
2952   // v3 = i32 load (1 byte)
2953   // v4 = i32 shl v3, 16
2954   // v5 = i32 or v4, v2
2955   // v1 = i24 trunc v5
2956   // By doing this we generate the correct truncate which should get
2957   // combined away as an artifact with a matching extend.
2958 
2959   uint64_t LargeSplitSize, SmallSplitSize;
2960 
2961   if (!isPowerOf2_32(MemSizeInBits)) {
2962     // This load needs splitting into power of 2 sized loads.
2963     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
2964     SmallSplitSize = MemSizeInBits - LargeSplitSize;
2965   } else {
2966     // This is already a power of 2, but we still need to split this in half.
2967     //
2968     // Assume we're being asked to decompose an unaligned load.
2969     // TODO: If this requires multiple splits, handle them all at once.
2970     auto &Ctx = MF.getFunction().getContext();
2971     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
2972       return UnableToLegalize;
2973 
2974     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
2975   }
2976 
2977   if (MemTy.isVector()) {
2978     // TODO: Handle vector extloads
2979     if (MemTy != DstTy)
2980       return UnableToLegalize;
2981 
2982     // TODO: We can do better than scalarizing the vector and at least split it
2983     // in half.
2984     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
2985   }
2986 
2987   MachineMemOperand *LargeMMO =
2988       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
2989   MachineMemOperand *SmallMMO =
2990       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
2991 
2992   LLT PtrTy = MRI.getType(PtrReg);
2993   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
2994   LLT AnyExtTy = LLT::scalar(AnyExtSize);
2995   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
2996                                              PtrReg, *LargeMMO);
2997 
2998   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
2999                                             LargeSplitSize / 8);
3000   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3001   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3002   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3003                                              SmallPtr, *SmallMMO);
3004 
3005   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3006   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3007 
3008   if (AnyExtTy == DstTy)
3009     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3010   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3011     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3012     MIRBuilder.buildTrunc(DstReg, {Or});
3013   } else {
3014     assert(DstTy.isPointer() && "expected pointer");
3015     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3016 
3017     // FIXME: We currently consider this to be illegal for non-integral address
3018     // spaces, but we need still need a way to reinterpret the bits.
3019     MIRBuilder.buildIntToPtr(DstReg, Or);
3020   }
3021 
3022   LoadMI.eraseFromParent();
3023   return Legalized;
3024 }
3025 
3026 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3027   // Lower a non-power of 2 store into multiple pow-2 stores.
3028   // E.g. split an i24 store into an i16 store + i8 store.
3029   // We do this by first extending the stored value to the next largest power
3030   // of 2 type, and then using truncating stores to store the components.
3031   // By doing this, likewise with G_LOAD, generate an extend that can be
3032   // artifact-combined away instead of leaving behind extracts.
3033   Register SrcReg = StoreMI.getValueReg();
3034   Register PtrReg = StoreMI.getPointerReg();
3035   LLT SrcTy = MRI.getType(SrcReg);
3036   MachineFunction &MF = MIRBuilder.getMF();
3037   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3038   LLT MemTy = MMO.getMemoryType();
3039 
3040   unsigned StoreWidth = MemTy.getSizeInBits();
3041   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3042 
3043   if (StoreWidth != StoreSizeInBits) {
3044     if (SrcTy.isVector())
3045       return UnableToLegalize;
3046 
3047     // Promote to a byte-sized store with upper bits zero if not
3048     // storing an integral number of bytes.  For example, promote
3049     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3050     LLT WideTy = LLT::scalar(StoreSizeInBits);
3051 
3052     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3053       // Avoid creating a store with a narrower source than result.
3054       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3055       SrcTy = WideTy;
3056     }
3057 
3058     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3059 
3060     MachineMemOperand *NewMMO =
3061         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3062     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3063     StoreMI.eraseFromParent();
3064     return Legalized;
3065   }
3066 
3067   if (MemTy.isVector()) {
3068     // TODO: Handle vector trunc stores
3069     if (MemTy != SrcTy)
3070       return UnableToLegalize;
3071 
3072     // TODO: We can do better than scalarizing the vector and at least split it
3073     // in half.
3074     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3075   }
3076 
3077   unsigned MemSizeInBits = MemTy.getSizeInBits();
3078   uint64_t LargeSplitSize, SmallSplitSize;
3079 
3080   if (!isPowerOf2_32(MemSizeInBits)) {
3081     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3082     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3083   } else {
3084     auto &Ctx = MF.getFunction().getContext();
3085     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3086       return UnableToLegalize; // Don't know what we're being asked to do.
3087 
3088     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3089   }
3090 
3091   // Extend to the next pow-2. If this store was itself the result of lowering,
3092   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3093   // that's wider than the stored size.
3094   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3095   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3096 
3097   if (SrcTy.isPointer()) {
3098     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3099     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3100   }
3101 
3102   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3103 
3104   // Obtain the smaller value by shifting away the larger value.
3105   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3106   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3107 
3108   // Generate the PtrAdd and truncating stores.
3109   LLT PtrTy = MRI.getType(PtrReg);
3110   auto OffsetCst = MIRBuilder.buildConstant(
3111     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3112   auto SmallPtr =
3113     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3114 
3115   MachineMemOperand *LargeMMO =
3116     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3117   MachineMemOperand *SmallMMO =
3118     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3119   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3120   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3121   StoreMI.eraseFromParent();
3122   return Legalized;
3123 }
3124 
3125 LegalizerHelper::LegalizeResult
3126 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3127   switch (MI.getOpcode()) {
3128   case TargetOpcode::G_LOAD: {
3129     if (TypeIdx != 0)
3130       return UnableToLegalize;
3131     MachineMemOperand &MMO = **MI.memoperands_begin();
3132 
3133     // Not sure how to interpret a bitcast of an extending load.
3134     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3135       return UnableToLegalize;
3136 
3137     Observer.changingInstr(MI);
3138     bitcastDst(MI, CastTy, 0);
3139     MMO.setType(CastTy);
3140     Observer.changedInstr(MI);
3141     return Legalized;
3142   }
3143   case TargetOpcode::G_STORE: {
3144     if (TypeIdx != 0)
3145       return UnableToLegalize;
3146 
3147     MachineMemOperand &MMO = **MI.memoperands_begin();
3148 
3149     // Not sure how to interpret a bitcast of a truncating store.
3150     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3151       return UnableToLegalize;
3152 
3153     Observer.changingInstr(MI);
3154     bitcastSrc(MI, CastTy, 0);
3155     MMO.setType(CastTy);
3156     Observer.changedInstr(MI);
3157     return Legalized;
3158   }
3159   case TargetOpcode::G_SELECT: {
3160     if (TypeIdx != 0)
3161       return UnableToLegalize;
3162 
3163     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3164       LLVM_DEBUG(
3165           dbgs() << "bitcast action not implemented for vector select\n");
3166       return UnableToLegalize;
3167     }
3168 
3169     Observer.changingInstr(MI);
3170     bitcastSrc(MI, CastTy, 2);
3171     bitcastSrc(MI, CastTy, 3);
3172     bitcastDst(MI, CastTy, 0);
3173     Observer.changedInstr(MI);
3174     return Legalized;
3175   }
3176   case TargetOpcode::G_AND:
3177   case TargetOpcode::G_OR:
3178   case TargetOpcode::G_XOR: {
3179     Observer.changingInstr(MI);
3180     bitcastSrc(MI, CastTy, 1);
3181     bitcastSrc(MI, CastTy, 2);
3182     bitcastDst(MI, CastTy, 0);
3183     Observer.changedInstr(MI);
3184     return Legalized;
3185   }
3186   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3187     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3188   case TargetOpcode::G_INSERT_VECTOR_ELT:
3189     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3190   default:
3191     return UnableToLegalize;
3192   }
3193 }
3194 
3195 // Legalize an instruction by changing the opcode in place.
3196 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3197     Observer.changingInstr(MI);
3198     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3199     Observer.changedInstr(MI);
3200 }
3201 
3202 LegalizerHelper::LegalizeResult
3203 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3204   using namespace TargetOpcode;
3205 
3206   switch(MI.getOpcode()) {
3207   default:
3208     return UnableToLegalize;
3209   case TargetOpcode::G_BITCAST:
3210     return lowerBitcast(MI);
3211   case TargetOpcode::G_SREM:
3212   case TargetOpcode::G_UREM: {
3213     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3214     auto Quot =
3215         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3216                               {MI.getOperand(1), MI.getOperand(2)});
3217 
3218     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3219     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3220     MI.eraseFromParent();
3221     return Legalized;
3222   }
3223   case TargetOpcode::G_SADDO:
3224   case TargetOpcode::G_SSUBO:
3225     return lowerSADDO_SSUBO(MI);
3226   case TargetOpcode::G_UMULH:
3227   case TargetOpcode::G_SMULH:
3228     return lowerSMULH_UMULH(MI);
3229   case TargetOpcode::G_SMULO:
3230   case TargetOpcode::G_UMULO: {
3231     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3232     // result.
3233     Register Res = MI.getOperand(0).getReg();
3234     Register Overflow = MI.getOperand(1).getReg();
3235     Register LHS = MI.getOperand(2).getReg();
3236     Register RHS = MI.getOperand(3).getReg();
3237     LLT Ty = MRI.getType(Res);
3238 
3239     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3240                           ? TargetOpcode::G_SMULH
3241                           : TargetOpcode::G_UMULH;
3242 
3243     Observer.changingInstr(MI);
3244     const auto &TII = MIRBuilder.getTII();
3245     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3246     MI.RemoveOperand(1);
3247     Observer.changedInstr(MI);
3248 
3249     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3250     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3251 
3252     // Move insert point forward so we can use the Res register if needed.
3253     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3254 
3255     // For *signed* multiply, overflow is detected by checking:
3256     // (hi != (lo >> bitwidth-1))
3257     if (Opcode == TargetOpcode::G_SMULH) {
3258       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3259       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3260       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3261     } else {
3262       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3263     }
3264     return Legalized;
3265   }
3266   case TargetOpcode::G_FNEG: {
3267     Register Res = MI.getOperand(0).getReg();
3268     LLT Ty = MRI.getType(Res);
3269 
3270     // TODO: Handle vector types once we are able to
3271     // represent them.
3272     if (Ty.isVector())
3273       return UnableToLegalize;
3274     auto SignMask =
3275         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3276     Register SubByReg = MI.getOperand(1).getReg();
3277     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3278     MI.eraseFromParent();
3279     return Legalized;
3280   }
3281   case TargetOpcode::G_FSUB: {
3282     Register Res = MI.getOperand(0).getReg();
3283     LLT Ty = MRI.getType(Res);
3284 
3285     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3286     // First, check if G_FNEG is marked as Lower. If so, we may
3287     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3288     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3289       return UnableToLegalize;
3290     Register LHS = MI.getOperand(1).getReg();
3291     Register RHS = MI.getOperand(2).getReg();
3292     Register Neg = MRI.createGenericVirtualRegister(Ty);
3293     MIRBuilder.buildFNeg(Neg, RHS);
3294     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_FMAD:
3299     return lowerFMad(MI);
3300   case TargetOpcode::G_FFLOOR:
3301     return lowerFFloor(MI);
3302   case TargetOpcode::G_INTRINSIC_ROUND:
3303     return lowerIntrinsicRound(MI);
3304   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3305     // Since round even is the assumed rounding mode for unconstrained FP
3306     // operations, rint and roundeven are the same operation.
3307     changeOpcode(MI, TargetOpcode::G_FRINT);
3308     return Legalized;
3309   }
3310   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3311     Register OldValRes = MI.getOperand(0).getReg();
3312     Register SuccessRes = MI.getOperand(1).getReg();
3313     Register Addr = MI.getOperand(2).getReg();
3314     Register CmpVal = MI.getOperand(3).getReg();
3315     Register NewVal = MI.getOperand(4).getReg();
3316     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3317                                   **MI.memoperands_begin());
3318     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3319     MI.eraseFromParent();
3320     return Legalized;
3321   }
3322   case TargetOpcode::G_LOAD:
3323   case TargetOpcode::G_SEXTLOAD:
3324   case TargetOpcode::G_ZEXTLOAD:
3325     return lowerLoad(cast<GAnyLoad>(MI));
3326   case TargetOpcode::G_STORE:
3327     return lowerStore(cast<GStore>(MI));
3328   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3329   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3330   case TargetOpcode::G_CTLZ:
3331   case TargetOpcode::G_CTTZ:
3332   case TargetOpcode::G_CTPOP:
3333     return lowerBitCount(MI);
3334   case G_UADDO: {
3335     Register Res = MI.getOperand(0).getReg();
3336     Register CarryOut = MI.getOperand(1).getReg();
3337     Register LHS = MI.getOperand(2).getReg();
3338     Register RHS = MI.getOperand(3).getReg();
3339 
3340     MIRBuilder.buildAdd(Res, LHS, RHS);
3341     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3342 
3343     MI.eraseFromParent();
3344     return Legalized;
3345   }
3346   case G_UADDE: {
3347     Register Res = MI.getOperand(0).getReg();
3348     Register CarryOut = MI.getOperand(1).getReg();
3349     Register LHS = MI.getOperand(2).getReg();
3350     Register RHS = MI.getOperand(3).getReg();
3351     Register CarryIn = MI.getOperand(4).getReg();
3352     LLT Ty = MRI.getType(Res);
3353 
3354     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3355     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3356     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3357     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3358 
3359     MI.eraseFromParent();
3360     return Legalized;
3361   }
3362   case G_USUBO: {
3363     Register Res = MI.getOperand(0).getReg();
3364     Register BorrowOut = MI.getOperand(1).getReg();
3365     Register LHS = MI.getOperand(2).getReg();
3366     Register RHS = MI.getOperand(3).getReg();
3367 
3368     MIRBuilder.buildSub(Res, LHS, RHS);
3369     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3370 
3371     MI.eraseFromParent();
3372     return Legalized;
3373   }
3374   case G_USUBE: {
3375     Register Res = MI.getOperand(0).getReg();
3376     Register BorrowOut = MI.getOperand(1).getReg();
3377     Register LHS = MI.getOperand(2).getReg();
3378     Register RHS = MI.getOperand(3).getReg();
3379     Register BorrowIn = MI.getOperand(4).getReg();
3380     const LLT CondTy = MRI.getType(BorrowOut);
3381     const LLT Ty = MRI.getType(Res);
3382 
3383     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3384     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3385     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3386 
3387     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3388     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3389     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3390 
3391     MI.eraseFromParent();
3392     return Legalized;
3393   }
3394   case G_UITOFP:
3395     return lowerUITOFP(MI);
3396   case G_SITOFP:
3397     return lowerSITOFP(MI);
3398   case G_FPTOUI:
3399     return lowerFPTOUI(MI);
3400   case G_FPTOSI:
3401     return lowerFPTOSI(MI);
3402   case G_FPTRUNC:
3403     return lowerFPTRUNC(MI);
3404   case G_FPOWI:
3405     return lowerFPOWI(MI);
3406   case G_SMIN:
3407   case G_SMAX:
3408   case G_UMIN:
3409   case G_UMAX:
3410     return lowerMinMax(MI);
3411   case G_FCOPYSIGN:
3412     return lowerFCopySign(MI);
3413   case G_FMINNUM:
3414   case G_FMAXNUM:
3415     return lowerFMinNumMaxNum(MI);
3416   case G_MERGE_VALUES:
3417     return lowerMergeValues(MI);
3418   case G_UNMERGE_VALUES:
3419     return lowerUnmergeValues(MI);
3420   case TargetOpcode::G_SEXT_INREG: {
3421     assert(MI.getOperand(2).isImm() && "Expected immediate");
3422     int64_t SizeInBits = MI.getOperand(2).getImm();
3423 
3424     Register DstReg = MI.getOperand(0).getReg();
3425     Register SrcReg = MI.getOperand(1).getReg();
3426     LLT DstTy = MRI.getType(DstReg);
3427     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3428 
3429     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3430     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3431     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3432     MI.eraseFromParent();
3433     return Legalized;
3434   }
3435   case G_EXTRACT_VECTOR_ELT:
3436   case G_INSERT_VECTOR_ELT:
3437     return lowerExtractInsertVectorElt(MI);
3438   case G_SHUFFLE_VECTOR:
3439     return lowerShuffleVector(MI);
3440   case G_DYN_STACKALLOC:
3441     return lowerDynStackAlloc(MI);
3442   case G_EXTRACT:
3443     return lowerExtract(MI);
3444   case G_INSERT:
3445     return lowerInsert(MI);
3446   case G_BSWAP:
3447     return lowerBswap(MI);
3448   case G_BITREVERSE:
3449     return lowerBitreverse(MI);
3450   case G_READ_REGISTER:
3451   case G_WRITE_REGISTER:
3452     return lowerReadWriteRegister(MI);
3453   case G_UADDSAT:
3454   case G_USUBSAT: {
3455     // Try to make a reasonable guess about which lowering strategy to use. The
3456     // target can override this with custom lowering and calling the
3457     // implementation functions.
3458     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3459     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3460       return lowerAddSubSatToMinMax(MI);
3461     return lowerAddSubSatToAddoSubo(MI);
3462   }
3463   case G_SADDSAT:
3464   case G_SSUBSAT: {
3465     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3466 
3467     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3468     // since it's a shorter expansion. However, we would need to figure out the
3469     // preferred boolean type for the carry out for the query.
3470     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3471       return lowerAddSubSatToMinMax(MI);
3472     return lowerAddSubSatToAddoSubo(MI);
3473   }
3474   case G_SSHLSAT:
3475   case G_USHLSAT:
3476     return lowerShlSat(MI);
3477   case G_ABS:
3478     return lowerAbsToAddXor(MI);
3479   case G_SELECT:
3480     return lowerSelect(MI);
3481   case G_SDIVREM:
3482   case G_UDIVREM:
3483     return lowerDIVREM(MI);
3484   case G_FSHL:
3485   case G_FSHR:
3486     return lowerFunnelShift(MI);
3487   case G_ROTL:
3488   case G_ROTR:
3489     return lowerRotate(MI);
3490   case G_MEMSET:
3491   case G_MEMCPY:
3492   case G_MEMMOVE:
3493     return lowerMemCpyFamily(MI);
3494   case G_MEMCPY_INLINE:
3495     return lowerMemcpyInline(MI);
3496   GISEL_VECREDUCE_CASES_NONSEQ
3497     return lowerVectorReduction(MI);
3498   }
3499 }
3500 
3501 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3502                                                   Align MinAlign) const {
3503   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3504   // datalayout for the preferred alignment. Also there should be a target hook
3505   // for this to allow targets to reduce the alignment and ignore the
3506   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3507   // the type.
3508   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3509 }
3510 
3511 MachineInstrBuilder
3512 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3513                                       MachinePointerInfo &PtrInfo) {
3514   MachineFunction &MF = MIRBuilder.getMF();
3515   const DataLayout &DL = MIRBuilder.getDataLayout();
3516   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3517 
3518   unsigned AddrSpace = DL.getAllocaAddrSpace();
3519   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3520 
3521   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3522   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3523 }
3524 
3525 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3526                                         LLT VecTy) {
3527   int64_t IdxVal;
3528   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3529     return IdxReg;
3530 
3531   LLT IdxTy = B.getMRI()->getType(IdxReg);
3532   unsigned NElts = VecTy.getNumElements();
3533   if (isPowerOf2_32(NElts)) {
3534     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3535     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3536   }
3537 
3538   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3539       .getReg(0);
3540 }
3541 
3542 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3543                                                   Register Index) {
3544   LLT EltTy = VecTy.getElementType();
3545 
3546   // Calculate the element offset and add it to the pointer.
3547   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3548   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3549          "Converting bits to bytes lost precision");
3550 
3551   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3552 
3553   LLT IdxTy = MRI.getType(Index);
3554   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3555                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3556 
3557   LLT PtrTy = MRI.getType(VecPtr);
3558   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3559 }
3560 
3561 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3562     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3563   Register DstReg = MI.getOperand(0).getReg();
3564   LLT DstTy = MRI.getType(DstReg);
3565   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3566 
3567   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3568 
3569   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3570   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3571 
3572   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3573   MI.eraseFromParent();
3574   return Legalized;
3575 }
3576 
3577 // Handle splitting vector operations which need to have the same number of
3578 // elements in each type index, but each type index may have a different element
3579 // type.
3580 //
3581 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3582 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3583 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3584 //
3585 // Also handles some irregular breakdown cases, e.g.
3586 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3587 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3588 //             s64 = G_SHL s64, s32
3589 LegalizerHelper::LegalizeResult
3590 LegalizerHelper::fewerElementsVectorMultiEltType(
3591   MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
3592   if (TypeIdx != 0)
3593     return UnableToLegalize;
3594 
3595   const LLT NarrowTy0 = NarrowTyArg;
3596   const Register DstReg = MI.getOperand(0).getReg();
3597   LLT DstTy = MRI.getType(DstReg);
3598   LLT LeftoverTy0;
3599 
3600   // All of the operands need to have the same number of elements, so if we can
3601   // determine a type breakdown for the result type, we can for all of the
3602   // source types.
3603   int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
3604   if (NumParts < 0)
3605     return UnableToLegalize;
3606 
3607   SmallVector<MachineInstrBuilder, 4> NewInsts;
3608 
3609   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3610   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3611 
3612   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
3613     Register SrcReg = MI.getOperand(I).getReg();
3614     LLT SrcTyI = MRI.getType(SrcReg);
3615     const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
3616                                             : ElementCount::getFixed(1);
3617     LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
3618     LLT LeftoverTyI;
3619 
3620     // Split this operand into the requested typed registers, and any leftover
3621     // required to reproduce the original type.
3622     if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
3623                       LeftoverRegs))
3624       return UnableToLegalize;
3625 
3626     if (I == 1) {
3627       // For the first operand, create an instruction for each part and setup
3628       // the result.
3629       for (Register PartReg : PartRegs) {
3630         Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3631         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3632                                .addDef(PartDstReg)
3633                                .addUse(PartReg));
3634         DstRegs.push_back(PartDstReg);
3635       }
3636 
3637       for (Register LeftoverReg : LeftoverRegs) {
3638         Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
3639         NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
3640                                .addDef(PartDstReg)
3641                                .addUse(LeftoverReg));
3642         LeftoverDstRegs.push_back(PartDstReg);
3643       }
3644     } else {
3645       assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());
3646 
3647       // Add the newly created operand splits to the existing instructions. The
3648       // odd-sized pieces are ordered after the requested NarrowTyArg sized
3649       // pieces.
3650       unsigned InstCount = 0;
3651       for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
3652         NewInsts[InstCount++].addUse(PartRegs[J]);
3653       for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
3654         NewInsts[InstCount++].addUse(LeftoverRegs[J]);
3655     }
3656 
3657     PartRegs.clear();
3658     LeftoverRegs.clear();
3659   }
3660 
3661   // Insert the newly built operations and rebuild the result register.
3662   for (auto &MIB : NewInsts)
3663     MIRBuilder.insertInstr(MIB);
3664 
3665   insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);
3666 
3667   MI.eraseFromParent();
3668   return Legalized;
3669 }
3670 
3671 LegalizerHelper::LegalizeResult
3672 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3673                                           LLT NarrowTy) {
3674   if (TypeIdx != 0)
3675     return UnableToLegalize;
3676 
3677   Register DstReg = MI.getOperand(0).getReg();
3678   Register SrcReg = MI.getOperand(1).getReg();
3679   LLT DstTy = MRI.getType(DstReg);
3680   LLT SrcTy = MRI.getType(SrcReg);
3681 
3682   LLT NarrowTy0 = NarrowTy;
3683   LLT NarrowTy1;
3684   unsigned NumParts;
3685 
3686   if (NarrowTy.isVector()) {
3687     // Uneven breakdown not handled.
3688     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3689     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3690       return UnableToLegalize;
3691 
3692     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3693   } else {
3694     NumParts = DstTy.getNumElements();
3695     NarrowTy1 = SrcTy.getElementType();
3696   }
3697 
3698   SmallVector<Register, 4> SrcRegs, DstRegs;
3699   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3700 
3701   for (unsigned I = 0; I < NumParts; ++I) {
3702     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3703     MachineInstr *NewInst =
3704         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3705 
3706     NewInst->setFlags(MI.getFlags());
3707     DstRegs.push_back(DstReg);
3708   }
3709 
3710   if (NarrowTy.isVector())
3711     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3712   else
3713     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3714 
3715   MI.eraseFromParent();
3716   return Legalized;
3717 }
3718 
3719 LegalizerHelper::LegalizeResult
3720 LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
3721                                         LLT NarrowTy) {
3722   Register DstReg = MI.getOperand(0).getReg();
3723   Register Src0Reg = MI.getOperand(2).getReg();
3724   LLT DstTy = MRI.getType(DstReg);
3725   LLT SrcTy = MRI.getType(Src0Reg);
3726 
3727   unsigned NumParts;
3728   LLT NarrowTy0, NarrowTy1;
3729 
3730   if (TypeIdx == 0) {
3731     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3732     unsigned OldElts = DstTy.getNumElements();
3733 
3734     NarrowTy0 = NarrowTy;
3735     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
3736     NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
3737                                                   SrcTy.getScalarSizeInBits())
3738                                     : SrcTy.getElementType();
3739 
3740   } else {
3741     unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
3742     unsigned OldElts = SrcTy.getNumElements();
3743 
3744     NumParts = NarrowTy.isVector() ? (OldElts / NewElts) :
3745       NarrowTy.getNumElements();
3746     NarrowTy0 =
3747         LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
3748     NarrowTy1 = NarrowTy;
3749   }
3750 
3751   // FIXME: Don't know how to handle the situation where the small vectors
3752   // aren't all the same size yet.
3753   if (NarrowTy1.isVector() &&
3754       NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
3755     return UnableToLegalize;
3756 
3757   CmpInst::Predicate Pred
3758     = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3759 
3760   SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
3761   extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
3762   extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);
3763 
3764   for (unsigned I = 0; I < NumParts; ++I) {
3765     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3766     DstRegs.push_back(DstReg);
3767 
3768     if (MI.getOpcode() == TargetOpcode::G_ICMP)
3769       MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3770     else {
3771       MachineInstr *NewCmp
3772         = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
3773       NewCmp->setFlags(MI.getFlags());
3774     }
3775   }
3776 
3777   if (NarrowTy1.isVector())
3778     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3779   else
3780     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3781 
3782   MI.eraseFromParent();
3783   return Legalized;
3784 }
3785 
3786 LegalizerHelper::LegalizeResult
3787 LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
3788                                            LLT NarrowTy) {
3789   Register DstReg = MI.getOperand(0).getReg();
3790   Register CondReg = MI.getOperand(1).getReg();
3791 
3792   unsigned NumParts = 0;
3793   LLT NarrowTy0, NarrowTy1;
3794 
3795   LLT DstTy = MRI.getType(DstReg);
3796   LLT CondTy = MRI.getType(CondReg);
3797   unsigned Size = DstTy.getSizeInBits();
3798 
3799   assert(TypeIdx == 0 || CondTy.isVector());
3800 
3801   if (TypeIdx == 0) {
3802     NarrowTy0 = NarrowTy;
3803     NarrowTy1 = CondTy;
3804 
3805     unsigned NarrowSize = NarrowTy0.getSizeInBits();
3806     // FIXME: Don't know how to handle the situation where the small vectors
3807     // aren't all the same size yet.
3808     if (Size % NarrowSize != 0)
3809       return UnableToLegalize;
3810 
3811     NumParts = Size / NarrowSize;
3812 
3813     // Need to break down the condition type
3814     if (CondTy.isVector()) {
3815       if (CondTy.getNumElements() == NumParts)
3816         NarrowTy1 = CondTy.getElementType();
3817       else
3818         NarrowTy1 =
3819             LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
3820                         CondTy.getScalarSizeInBits());
3821     }
3822   } else {
3823     NumParts = CondTy.getNumElements();
3824     if (NarrowTy.isVector()) {
3825       // TODO: Handle uneven breakdown.
3826       if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements())
3827         return UnableToLegalize;
3828 
3829       return UnableToLegalize;
3830     } else {
3831       NarrowTy0 = DstTy.getElementType();
3832       NarrowTy1 = NarrowTy;
3833     }
3834   }
3835 
3836   SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
3837   if (CondTy.isVector())
3838     extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);
3839 
3840   extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
3841   extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);
3842 
3843   for (unsigned i = 0; i < NumParts; ++i) {
3844     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3845     MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
3846                            Src1Regs[i], Src2Regs[i]);
3847     DstRegs.push_back(DstReg);
3848   }
3849 
3850   if (NarrowTy0.isVector())
3851     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3852   else
3853     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3854 
3855   MI.eraseFromParent();
3856   return Legalized;
3857 }
3858 
3859 LegalizerHelper::LegalizeResult
3860 LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
3861                                         LLT NarrowTy) {
3862   const Register DstReg = MI.getOperand(0).getReg();
3863   LLT PhiTy = MRI.getType(DstReg);
3864   LLT LeftoverTy;
3865 
3866   // All of the operands need to have the same number of elements, so if we can
3867   // determine a type breakdown for the result type, we can for all of the
3868   // source types.
3869   int NumParts, NumLeftover;
3870   std::tie(NumParts, NumLeftover)
3871     = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
3872   if (NumParts < 0)
3873     return UnableToLegalize;
3874 
3875   SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
3876   SmallVector<MachineInstrBuilder, 4> NewInsts;
3877 
3878   const int TotalNumParts = NumParts + NumLeftover;
3879 
3880   // Insert the new phis in the result block first.
3881   for (int I = 0; I != TotalNumParts; ++I) {
3882     LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
3883     Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
3884     NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
3885                        .addDef(PartDstReg));
3886     if (I < NumParts)
3887       DstRegs.push_back(PartDstReg);
3888     else
3889       LeftoverDstRegs.push_back(PartDstReg);
3890   }
3891 
3892   MachineBasicBlock *MBB = MI.getParent();
3893   MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
3894   insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);
3895 
3896   SmallVector<Register, 4> PartRegs, LeftoverRegs;
3897 
3898   // Insert code to extract the incoming values in each predecessor block.
3899   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3900     PartRegs.clear();
3901     LeftoverRegs.clear();
3902 
3903     Register SrcReg = MI.getOperand(I).getReg();
3904     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3905     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3906 
3907     LLT Unused;
3908     if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
3909                       LeftoverRegs))
3910       return UnableToLegalize;
3911 
3912     // Add the newly created operand splits to the existing instructions. The
3913     // odd-sized pieces are ordered after the requested NarrowTyArg sized
3914     // pieces.
3915     for (int J = 0; J != TotalNumParts; ++J) {
3916       MachineInstrBuilder MIB = NewInsts[J];
3917       MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
3918       MIB.addMBB(&OpMBB);
3919     }
3920   }
3921 
3922   MI.eraseFromParent();
3923   return Legalized;
3924 }
3925 
3926 LegalizerHelper::LegalizeResult
3927 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3928                                                   unsigned TypeIdx,
3929                                                   LLT NarrowTy) {
3930   if (TypeIdx != 1)
3931     return UnableToLegalize;
3932 
3933   const int NumDst = MI.getNumOperands() - 1;
3934   const Register SrcReg = MI.getOperand(NumDst).getReg();
3935   LLT SrcTy = MRI.getType(SrcReg);
3936 
3937   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3938 
3939   // TODO: Create sequence of extracts.
3940   if (DstTy == NarrowTy)
3941     return UnableToLegalize;
3942 
3943   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3944   if (DstTy == GCDTy) {
3945     // This would just be a copy of the same unmerge.
3946     // TODO: Create extracts, pad with undef and create intermediate merges.
3947     return UnableToLegalize;
3948   }
3949 
3950   auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
3951   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3952   const int PartsPerUnmerge = NumDst / NumUnmerge;
3953 
3954   for (int I = 0; I != NumUnmerge; ++I) {
3955     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3956 
3957     for (int J = 0; J != PartsPerUnmerge; ++J)
3958       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3959     MIB.addUse(Unmerge.getReg(I));
3960   }
3961 
3962   MI.eraseFromParent();
3963   return Legalized;
3964 }
3965 
3966 LegalizerHelper::LegalizeResult
3967 LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
3968                                          LLT NarrowTy) {
3969   Register Result = MI.getOperand(0).getReg();
3970   Register Overflow = MI.getOperand(1).getReg();
3971   Register LHS = MI.getOperand(2).getReg();
3972   Register RHS = MI.getOperand(3).getReg();
3973 
3974   LLT SrcTy = MRI.getType(LHS);
3975   if (!SrcTy.isVector())
3976     return UnableToLegalize;
3977 
3978   LLT ElementType = SrcTy.getElementType();
3979   LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
3980   const ElementCount NumResult = SrcTy.getElementCount();
3981   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3982 
3983   // Unmerge the operands to smaller parts of GCD type.
3984   auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
3985   auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);
3986 
3987   const int NumOps = UnmergeLHS->getNumOperands() - 1;
3988   const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
3989   LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
3990   LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);
3991 
3992   // Perform the operation over unmerged parts.
3993   SmallVector<Register, 8> ResultParts;
3994   SmallVector<Register, 8> OverflowParts;
3995   for (int I = 0; I != NumOps; ++I) {
3996     Register Operand1 = UnmergeLHS->getOperand(I).getReg();
3997     Register Operand2 = UnmergeRHS->getOperand(I).getReg();
3998     auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
3999                                          {Operand1, Operand2});
4000     ResultParts.push_back(PartMul->getOperand(0).getReg());
4001     OverflowParts.push_back(PartMul->getOperand(1).getReg());
4002   }
4003 
4004   LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
4005   LLT OverflowLCMTy =
4006       LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);
4007 
4008   // Recombine the pieces to the original result and overflow registers.
4009   buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
4010   buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
4011   MI.eraseFromParent();
4012   return Legalized;
4013 }
4014 
4015 // Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces
4016 // a vector
4017 //
4018 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
4019 // undef as necessary.
4020 //
4021 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
4022 //   -> <2 x s16>
4023 //
4024 // %4:_(s16) = G_IMPLICIT_DEF
4025 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
4026 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
4027 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
4028 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
4029 // %3:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %8
4030 LegalizerHelper::LegalizeResult
4031 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4032                                           LLT NarrowTy) {
4033   Register DstReg = MI.getOperand(0).getReg();
4034   LLT DstTy = MRI.getType(DstReg);
4035   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
4036   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
4037 
4038   // Break into a common type
4039   SmallVector<Register, 16> Parts;
4040   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
4041     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
4042 
4043   // Build the requested new merge, padding with undef.
4044   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
4045                                   TargetOpcode::G_ANYEXT);
4046 
4047   // Pack into the original result register.
4048   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4049 
4050   MI.eraseFromParent();
4051   return Legalized;
4052 }
4053 
4054 LegalizerHelper::LegalizeResult
4055 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4056                                                            unsigned TypeIdx,
4057                                                            LLT NarrowVecTy) {
4058   Register DstReg = MI.getOperand(0).getReg();
4059   Register SrcVec = MI.getOperand(1).getReg();
4060   Register InsertVal;
4061   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4062 
4063   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4064   if (IsInsert)
4065     InsertVal = MI.getOperand(2).getReg();
4066 
4067   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4068 
4069   // TODO: Handle total scalarization case.
4070   if (!NarrowVecTy.isVector())
4071     return UnableToLegalize;
4072 
4073   LLT VecTy = MRI.getType(SrcVec);
4074 
4075   // If the index is a constant, we can really break this down as you would
4076   // expect, and index into the target size pieces.
4077   int64_t IdxVal;
4078   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4079   if (MaybeCst) {
4080     IdxVal = MaybeCst->Value.getSExtValue();
4081     // Avoid out of bounds indexing the pieces.
4082     if (IdxVal >= VecTy.getNumElements()) {
4083       MIRBuilder.buildUndef(DstReg);
4084       MI.eraseFromParent();
4085       return Legalized;
4086     }
4087 
4088     SmallVector<Register, 8> VecParts;
4089     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4090 
4091     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4092     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4093                                     TargetOpcode::G_ANYEXT);
4094 
4095     unsigned NewNumElts = NarrowVecTy.getNumElements();
4096 
4097     LLT IdxTy = MRI.getType(Idx);
4098     int64_t PartIdx = IdxVal / NewNumElts;
4099     auto NewIdx =
4100         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4101 
4102     if (IsInsert) {
4103       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4104 
4105       // Use the adjusted index to insert into one of the subvectors.
4106       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4107           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4108       VecParts[PartIdx] = InsertPart.getReg(0);
4109 
4110       // Recombine the inserted subvector with the others to reform the result
4111       // vector.
4112       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4113     } else {
4114       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4115     }
4116 
4117     MI.eraseFromParent();
4118     return Legalized;
4119   }
4120 
4121   // With a variable index, we can't perform the operation in a smaller type, so
4122   // we're forced to expand this.
4123   //
4124   // TODO: We could emit a chain of compare/select to figure out which piece to
4125   // index.
4126   return lowerExtractInsertVectorElt(MI);
4127 }
4128 
4129 LegalizerHelper::LegalizeResult
4130 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4131                                       LLT NarrowTy) {
4132   // FIXME: Don't know how to handle secondary types yet.
4133   if (TypeIdx != 0)
4134     return UnableToLegalize;
4135 
4136   // This implementation doesn't work for atomics. Give up instead of doing
4137   // something invalid.
4138   if (LdStMI.isAtomic())
4139     return UnableToLegalize;
4140 
4141   bool IsLoad = isa<GLoad>(LdStMI);
4142   Register ValReg = LdStMI.getReg(0);
4143   Register AddrReg = LdStMI.getPointerReg();
4144   LLT ValTy = MRI.getType(ValReg);
4145 
4146   // FIXME: Do we need a distinct NarrowMemory legalize action?
4147   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4148     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4149     return UnableToLegalize;
4150   }
4151 
4152   int NumParts = -1;
4153   int NumLeftover = -1;
4154   LLT LeftoverTy;
4155   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4156   if (IsLoad) {
4157     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4158   } else {
4159     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4160                      NarrowLeftoverRegs)) {
4161       NumParts = NarrowRegs.size();
4162       NumLeftover = NarrowLeftoverRegs.size();
4163     }
4164   }
4165 
4166   if (NumParts == -1)
4167     return UnableToLegalize;
4168 
4169   LLT PtrTy = MRI.getType(AddrReg);
4170   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4171 
4172   unsigned TotalSize = ValTy.getSizeInBits();
4173 
4174   // Split the load/store into PartTy sized pieces starting at Offset. If this
4175   // is a load, return the new registers in ValRegs. For a store, each elements
4176   // of ValRegs should be PartTy. Returns the next offset that needs to be
4177   // handled.
4178   auto MMO = LdStMI.getMMO();
4179   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4180                              unsigned Offset) -> unsigned {
4181     MachineFunction &MF = MIRBuilder.getMF();
4182     unsigned PartSize = PartTy.getSizeInBits();
4183     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4184          Offset += PartSize, ++Idx) {
4185       unsigned ByteOffset = Offset / 8;
4186       Register NewAddrReg;
4187 
4188       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4189 
4190       MachineMemOperand *NewMMO =
4191           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4192 
4193       if (IsLoad) {
4194         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4195         ValRegs.push_back(Dst);
4196         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4197       } else {
4198         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4199       }
4200     }
4201 
4202     return Offset;
4203   };
4204 
4205   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4206 
4207   // Handle the rest of the register if this isn't an even type breakdown.
4208   if (LeftoverTy.isValid())
4209     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4210 
4211   if (IsLoad) {
4212     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4213                 LeftoverTy, NarrowLeftoverRegs);
4214   }
4215 
4216   LdStMI.eraseFromParent();
4217   return Legalized;
4218 }
4219 
4220 LegalizerHelper::LegalizeResult
4221 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
4222                                       LLT NarrowTy) {
4223   assert(TypeIdx == 0 && "only one type index expected");
4224 
4225   const unsigned Opc = MI.getOpcode();
4226   const int NumDefOps = MI.getNumExplicitDefs();
4227   const int NumSrcOps = MI.getNumOperands() - NumDefOps;
4228   const unsigned Flags = MI.getFlags();
4229   const unsigned NarrowSize = NarrowTy.getSizeInBits();
4230   const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
4231 
4232   assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
4233                                      "result and 1-3 sources or 2 results and "
4234                                      "1-2 sources");
4235 
4236   SmallVector<Register, 2> DstRegs;
4237   for (int I = 0; I < NumDefOps; ++I)
4238     DstRegs.push_back(MI.getOperand(I).getReg());
4239 
4240   // First of all check whether we are narrowing (changing the element type)
4241   // or reducing the vector elements
4242   const LLT DstTy = MRI.getType(DstRegs[0]);
4243   const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
4244 
4245   SmallVector<Register, 8> ExtractedRegs[3];
4246   SmallVector<Register, 8> Parts;
4247 
4248   // Break down all the sources into NarrowTy pieces we can operate on. This may
4249   // involve creating merges to a wider type, padded with undef.
4250   for (int I = 0; I != NumSrcOps; ++I) {
4251     Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
4252     LLT SrcTy = MRI.getType(SrcReg);
4253 
4254     // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
4255     // For fewerElements, this is a smaller vector with the same element type.
4256     LLT OpNarrowTy;
4257     if (IsNarrow) {
4258       OpNarrowTy = NarrowScalarTy;
4259 
4260       // In case of narrowing, we need to cast vectors to scalars for this to
4261       // work properly
4262       // FIXME: Can we do without the bitcast here if we're narrowing?
4263       if (SrcTy.isVector()) {
4264         SrcTy = LLT::scalar(SrcTy.getSizeInBits());
4265         SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
4266       }
4267     } else {
4268       auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
4269                                           : ElementCount::getFixed(1);
4270       OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
4271     }
4272 
4273     LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
4274 
4275     // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
4276     buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
4277                         TargetOpcode::G_ANYEXT);
4278   }
4279 
4280   SmallVector<Register, 8> ResultRegs[2];
4281 
4282   // Input operands for each sub-instruction.
4283   SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());
4284 
4285   int NumParts = ExtractedRegs[0].size();
4286   const unsigned DstSize = DstTy.getSizeInBits();
4287   const LLT DstScalarTy = LLT::scalar(DstSize);
4288 
4289   // Narrowing needs to use scalar types
4290   LLT DstLCMTy, NarrowDstTy;
4291   if (IsNarrow) {
4292     DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
4293     NarrowDstTy = NarrowScalarTy;
4294   } else {
4295     DstLCMTy = getLCMType(DstTy, NarrowTy);
4296     NarrowDstTy = NarrowTy;
4297   }
4298 
4299   // We widened the source registers to satisfy merge/unmerge size
4300   // constraints. We'll have some extra fully undef parts.
4301   const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
4302 
4303   for (int I = 0; I != NumRealParts; ++I) {
4304     // Emit this instruction on each of the split pieces.
4305     for (int J = 0; J != NumSrcOps; ++J)
4306       InputRegs[J] = ExtractedRegs[J][I];
4307 
4308     MachineInstrBuilder Inst;
4309     if (NumDefOps == 1)
4310       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
4311     else
4312       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
4313                                    Flags);
4314 
4315     for (int J = 0; J != NumDefOps; ++J)
4316       ResultRegs[J].push_back(Inst.getReg(J));
4317   }
4318 
4319   // Fill out the widened result with undef instead of creating instructions
4320   // with undef inputs.
4321   int NumUndefParts = NumParts - NumRealParts;
4322   if (NumUndefParts != 0) {
4323     Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
4324     for (int I = 0; I != NumDefOps; ++I)
4325       ResultRegs[I].append(NumUndefParts, Undef);
4326   }
4327 
4328   // Extract the possibly padded result. Use a scratch register if we need to do
4329   // a final bitcast, otherwise use the original result register.
4330   Register MergeDstReg;
4331   for (int I = 0; I != NumDefOps; ++I) {
4332     if (IsNarrow && DstTy.isVector())
4333       MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
4334     else
4335       MergeDstReg = DstRegs[I];
4336 
4337     buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);
4338 
4339     // Recast to vector if we narrowed a vector
4340     if (IsNarrow && DstTy.isVector())
4341       MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
4342   }
4343 
4344   MI.eraseFromParent();
4345   return Legalized;
4346 }
4347 
4348 LegalizerHelper::LegalizeResult
4349 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4350                                               LLT NarrowTy) {
4351   Register DstReg = MI.getOperand(0).getReg();
4352   Register SrcReg = MI.getOperand(1).getReg();
4353   int64_t Imm = MI.getOperand(2).getImm();
4354 
4355   LLT DstTy = MRI.getType(DstReg);
4356 
4357   SmallVector<Register, 8> Parts;
4358   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4359   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4360 
4361   for (Register &R : Parts)
4362     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4363 
4364   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4365 
4366   MI.eraseFromParent();
4367   return Legalized;
4368 }
4369 
4370 LegalizerHelper::LegalizeResult
4371 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4372                                      LLT NarrowTy) {
4373   using namespace TargetOpcode;
4374 
4375   switch (MI.getOpcode()) {
4376   case G_IMPLICIT_DEF:
4377     return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
4378   case G_TRUNC:
4379   case G_AND:
4380   case G_OR:
4381   case G_XOR:
4382   case G_ADD:
4383   case G_SUB:
4384   case G_MUL:
4385   case G_PTR_ADD:
4386   case G_SMULH:
4387   case G_UMULH:
4388   case G_FADD:
4389   case G_FMUL:
4390   case G_FSUB:
4391   case G_FNEG:
4392   case G_FABS:
4393   case G_FCANONICALIZE:
4394   case G_FDIV:
4395   case G_FREM:
4396   case G_FMA:
4397   case G_FMAD:
4398   case G_FPOW:
4399   case G_FEXP:
4400   case G_FEXP2:
4401   case G_FLOG:
4402   case G_FLOG2:
4403   case G_FLOG10:
4404   case G_FNEARBYINT:
4405   case G_FCEIL:
4406   case G_FFLOOR:
4407   case G_FRINT:
4408   case G_INTRINSIC_ROUND:
4409   case G_INTRINSIC_ROUNDEVEN:
4410   case G_INTRINSIC_TRUNC:
4411   case G_FCOS:
4412   case G_FSIN:
4413   case G_FSQRT:
4414   case G_BSWAP:
4415   case G_BITREVERSE:
4416   case G_SDIV:
4417   case G_UDIV:
4418   case G_SREM:
4419   case G_UREM:
4420   case G_SDIVREM:
4421   case G_UDIVREM:
4422   case G_SMIN:
4423   case G_SMAX:
4424   case G_UMIN:
4425   case G_UMAX:
4426   case G_ABS:
4427   case G_FMINNUM:
4428   case G_FMAXNUM:
4429   case G_FMINNUM_IEEE:
4430   case G_FMAXNUM_IEEE:
4431   case G_FMINIMUM:
4432   case G_FMAXIMUM:
4433   case G_FSHL:
4434   case G_FSHR:
4435   case G_ROTL:
4436   case G_ROTR:
4437   case G_FREEZE:
4438   case G_SADDSAT:
4439   case G_SSUBSAT:
4440   case G_UADDSAT:
4441   case G_USUBSAT:
4442     return reduceOperationWidth(MI, TypeIdx, NarrowTy);
4443   case G_UMULO:
4444   case G_SMULO:
4445     return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
4446   case G_SHL:
4447   case G_LSHR:
4448   case G_ASHR:
4449   case G_SSHLSAT:
4450   case G_USHLSAT:
4451   case G_CTLZ:
4452   case G_CTLZ_ZERO_UNDEF:
4453   case G_CTTZ:
4454   case G_CTTZ_ZERO_UNDEF:
4455   case G_CTPOP:
4456   case G_FCOPYSIGN:
4457     return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
4458   case G_ZEXT:
4459   case G_SEXT:
4460   case G_ANYEXT:
4461   case G_FPEXT:
4462   case G_FPTRUNC:
4463   case G_SITOFP:
4464   case G_UITOFP:
4465   case G_FPTOSI:
4466   case G_FPTOUI:
4467   case G_INTTOPTR:
4468   case G_PTRTOINT:
4469   case G_ADDRSPACE_CAST:
4470     return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
4471   case G_ICMP:
4472   case G_FCMP:
4473     return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
4474   case G_SELECT:
4475     return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
4476   case G_PHI:
4477     return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
4478   case G_UNMERGE_VALUES:
4479     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4480   case G_BUILD_VECTOR:
4481     assert(TypeIdx == 0 && "not a vector type index");
4482     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4483   case G_CONCAT_VECTORS:
4484     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4485       return UnableToLegalize;
4486     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4487   case G_EXTRACT_VECTOR_ELT:
4488   case G_INSERT_VECTOR_ELT:
4489     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4490   case G_LOAD:
4491   case G_STORE:
4492     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4493   case G_SEXT_INREG:
4494     return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
4495   GISEL_VECREDUCE_CASES_NONSEQ
4496     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4497   case G_SHUFFLE_VECTOR:
4498     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4499   default:
4500     return UnableToLegalize;
4501   }
4502 }
4503 
4504 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4505     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4506   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4507   if (TypeIdx != 0)
4508     return UnableToLegalize;
4509 
4510   Register DstReg = MI.getOperand(0).getReg();
4511   Register Src1Reg = MI.getOperand(1).getReg();
4512   Register Src2Reg = MI.getOperand(2).getReg();
4513   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4514   LLT DstTy = MRI.getType(DstReg);
4515   LLT Src1Ty = MRI.getType(Src1Reg);
4516   LLT Src2Ty = MRI.getType(Src2Reg);
4517   // The shuffle should be canonicalized by now.
4518   if (DstTy != Src1Ty)
4519     return UnableToLegalize;
4520   if (DstTy != Src2Ty)
4521     return UnableToLegalize;
4522 
4523   if (!isPowerOf2_32(DstTy.getNumElements()))
4524     return UnableToLegalize;
4525 
4526   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4527   // Further legalization attempts will be needed to do split further.
4528   NarrowTy =
4529       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4530   unsigned NewElts = NarrowTy.getNumElements();
4531 
4532   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4533   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4534   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4535   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4536                         SplitSrc2Regs[1]};
4537 
4538   Register Hi, Lo;
4539 
4540   // If Lo or Hi uses elements from at most two of the four input vectors, then
4541   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4542   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4543   SmallVector<int, 16> Ops;
4544   for (unsigned High = 0; High < 2; ++High) {
4545     Register &Output = High ? Hi : Lo;
4546 
4547     // Build a shuffle mask for the output, discovering on the fly which
4548     // input vectors to use as shuffle operands (recorded in InputUsed).
4549     // If building a suitable shuffle vector proves too hard, then bail
4550     // out with useBuildVector set.
4551     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4552     unsigned FirstMaskIdx = High * NewElts;
4553     bool UseBuildVector = false;
4554     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4555       // The mask element.  This indexes into the input.
4556       int Idx = Mask[FirstMaskIdx + MaskOffset];
4557 
4558       // The input vector this mask element indexes into.
4559       unsigned Input = (unsigned)Idx / NewElts;
4560 
4561       if (Input >= array_lengthof(Inputs)) {
4562         // The mask element does not index into any input vector.
4563         Ops.push_back(-1);
4564         continue;
4565       }
4566 
4567       // Turn the index into an offset from the start of the input vector.
4568       Idx -= Input * NewElts;
4569 
4570       // Find or create a shuffle vector operand to hold this input.
4571       unsigned OpNo;
4572       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4573         if (InputUsed[OpNo] == Input) {
4574           // This input vector is already an operand.
4575           break;
4576         } else if (InputUsed[OpNo] == -1U) {
4577           // Create a new operand for this input vector.
4578           InputUsed[OpNo] = Input;
4579           break;
4580         }
4581       }
4582 
4583       if (OpNo >= array_lengthof(InputUsed)) {
4584         // More than two input vectors used!  Give up on trying to create a
4585         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4586         UseBuildVector = true;
4587         break;
4588       }
4589 
4590       // Add the mask index for the new shuffle vector.
4591       Ops.push_back(Idx + OpNo * NewElts);
4592     }
4593 
4594     if (UseBuildVector) {
4595       LLT EltTy = NarrowTy.getElementType();
4596       SmallVector<Register, 16> SVOps;
4597 
4598       // Extract the input elements by hand.
4599       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4600         // The mask element.  This indexes into the input.
4601         int Idx = Mask[FirstMaskIdx + MaskOffset];
4602 
4603         // The input vector this mask element indexes into.
4604         unsigned Input = (unsigned)Idx / NewElts;
4605 
4606         if (Input >= array_lengthof(Inputs)) {
4607           // The mask element is "undef" or indexes off the end of the input.
4608           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4609           continue;
4610         }
4611 
4612         // Turn the index into an offset from the start of the input vector.
4613         Idx -= Input * NewElts;
4614 
4615         // Extract the vector element by hand.
4616         SVOps.push_back(MIRBuilder
4617                             .buildExtractVectorElement(
4618                                 EltTy, Inputs[Input],
4619                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4620                             .getReg(0));
4621       }
4622 
4623       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4624       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4625     } else if (InputUsed[0] == -1U) {
4626       // No input vectors were used! The result is undefined.
4627       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4628     } else {
4629       Register Op0 = Inputs[InputUsed[0]];
4630       // If only one input was used, use an undefined vector for the other.
4631       Register Op1 = InputUsed[1] == -1U
4632                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4633                          : Inputs[InputUsed[1]];
4634       // At least one input vector was used. Create a new shuffle vector.
4635       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4636     }
4637 
4638     Ops.clear();
4639   }
4640 
4641   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4642   MI.eraseFromParent();
4643   return Legalized;
4644 }
4645 
4646 static unsigned getScalarOpcForReduction(unsigned Opc) {
4647   unsigned ScalarOpc;
4648   switch (Opc) {
4649   case TargetOpcode::G_VECREDUCE_FADD:
4650     ScalarOpc = TargetOpcode::G_FADD;
4651     break;
4652   case TargetOpcode::G_VECREDUCE_FMUL:
4653     ScalarOpc = TargetOpcode::G_FMUL;
4654     break;
4655   case TargetOpcode::G_VECREDUCE_FMAX:
4656     ScalarOpc = TargetOpcode::G_FMAXNUM;
4657     break;
4658   case TargetOpcode::G_VECREDUCE_FMIN:
4659     ScalarOpc = TargetOpcode::G_FMINNUM;
4660     break;
4661   case TargetOpcode::G_VECREDUCE_ADD:
4662     ScalarOpc = TargetOpcode::G_ADD;
4663     break;
4664   case TargetOpcode::G_VECREDUCE_MUL:
4665     ScalarOpc = TargetOpcode::G_MUL;
4666     break;
4667   case TargetOpcode::G_VECREDUCE_AND:
4668     ScalarOpc = TargetOpcode::G_AND;
4669     break;
4670   case TargetOpcode::G_VECREDUCE_OR:
4671     ScalarOpc = TargetOpcode::G_OR;
4672     break;
4673   case TargetOpcode::G_VECREDUCE_XOR:
4674     ScalarOpc = TargetOpcode::G_XOR;
4675     break;
4676   case TargetOpcode::G_VECREDUCE_SMAX:
4677     ScalarOpc = TargetOpcode::G_SMAX;
4678     break;
4679   case TargetOpcode::G_VECREDUCE_SMIN:
4680     ScalarOpc = TargetOpcode::G_SMIN;
4681     break;
4682   case TargetOpcode::G_VECREDUCE_UMAX:
4683     ScalarOpc = TargetOpcode::G_UMAX;
4684     break;
4685   case TargetOpcode::G_VECREDUCE_UMIN:
4686     ScalarOpc = TargetOpcode::G_UMIN;
4687     break;
4688   default:
4689     llvm_unreachable("Unhandled reduction");
4690   }
4691   return ScalarOpc;
4692 }
4693 
4694 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4695     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4696   unsigned Opc = MI.getOpcode();
4697   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4698          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4699          "Sequential reductions not expected");
4700 
4701   if (TypeIdx != 1)
4702     return UnableToLegalize;
4703 
4704   // The semantics of the normal non-sequential reductions allow us to freely
4705   // re-associate the operation.
4706   Register SrcReg = MI.getOperand(1).getReg();
4707   LLT SrcTy = MRI.getType(SrcReg);
4708   Register DstReg = MI.getOperand(0).getReg();
4709   LLT DstTy = MRI.getType(DstReg);
4710 
4711   if (NarrowTy.isVector() &&
4712       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4713     return UnableToLegalize;
4714 
4715   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4716   SmallVector<Register> SplitSrcs;
4717   // If NarrowTy is a scalar then we're being asked to scalarize.
4718   const unsigned NumParts =
4719       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4720                           : SrcTy.getNumElements();
4721 
4722   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4723   if (NarrowTy.isScalar()) {
4724     if (DstTy != NarrowTy)
4725       return UnableToLegalize; // FIXME: handle implicit extensions.
4726 
4727     if (isPowerOf2_32(NumParts)) {
4728       // Generate a tree of scalar operations to reduce the critical path.
4729       SmallVector<Register> PartialResults;
4730       unsigned NumPartsLeft = NumParts;
4731       while (NumPartsLeft > 1) {
4732         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4733           PartialResults.emplace_back(
4734               MIRBuilder
4735                   .buildInstr(ScalarOpc, {NarrowTy},
4736                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4737                   .getReg(0));
4738         }
4739         SplitSrcs = PartialResults;
4740         PartialResults.clear();
4741         NumPartsLeft = SplitSrcs.size();
4742       }
4743       assert(SplitSrcs.size() == 1);
4744       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4745       MI.eraseFromParent();
4746       return Legalized;
4747     }
4748     // If we can't generate a tree, then just do sequential operations.
4749     Register Acc = SplitSrcs[0];
4750     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4751       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4752                 .getReg(0);
4753     MIRBuilder.buildCopy(DstReg, Acc);
4754     MI.eraseFromParent();
4755     return Legalized;
4756   }
4757   SmallVector<Register> PartialReductions;
4758   for (unsigned Part = 0; Part < NumParts; ++Part) {
4759     PartialReductions.push_back(
4760         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4761   }
4762 
4763 
4764   // If the types involved are powers of 2, we can generate intermediate vector
4765   // ops, before generating a final reduction operation.
4766   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4767       isPowerOf2_32(NarrowTy.getNumElements())) {
4768     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4769   }
4770 
4771   Register Acc = PartialReductions[0];
4772   for (unsigned Part = 1; Part < NumParts; ++Part) {
4773     if (Part == NumParts - 1) {
4774       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4775                             {Acc, PartialReductions[Part]});
4776     } else {
4777       Acc = MIRBuilder
4778                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4779                 .getReg(0);
4780     }
4781   }
4782   MI.eraseFromParent();
4783   return Legalized;
4784 }
4785 
4786 LegalizerHelper::LegalizeResult
4787 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4788                                         LLT SrcTy, LLT NarrowTy,
4789                                         unsigned ScalarOpc) {
4790   SmallVector<Register> SplitSrcs;
4791   // Split the sources into NarrowTy size pieces.
4792   extractParts(SrcReg, NarrowTy,
4793                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4794   // We're going to do a tree reduction using vector operations until we have
4795   // one NarrowTy size value left.
4796   while (SplitSrcs.size() > 1) {
4797     SmallVector<Register> PartialRdxs;
4798     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4799       Register LHS = SplitSrcs[Idx];
4800       Register RHS = SplitSrcs[Idx + 1];
4801       // Create the intermediate vector op.
4802       Register Res =
4803           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4804       PartialRdxs.push_back(Res);
4805     }
4806     SplitSrcs = std::move(PartialRdxs);
4807   }
4808   // Finally generate the requested NarrowTy based reduction.
4809   Observer.changingInstr(MI);
4810   MI.getOperand(1).setReg(SplitSrcs[0]);
4811   Observer.changedInstr(MI);
4812   return Legalized;
4813 }
4814 
4815 LegalizerHelper::LegalizeResult
4816 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4817                                              const LLT HalfTy, const LLT AmtTy) {
4818 
4819   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4820   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4821   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4822 
4823   if (Amt.isNullValue()) {
4824     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4825     MI.eraseFromParent();
4826     return Legalized;
4827   }
4828 
4829   LLT NVT = HalfTy;
4830   unsigned NVTBits = HalfTy.getSizeInBits();
4831   unsigned VTBits = 2 * NVTBits;
4832 
4833   SrcOp Lo(Register(0)), Hi(Register(0));
4834   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4835     if (Amt.ugt(VTBits)) {
4836       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4837     } else if (Amt.ugt(NVTBits)) {
4838       Lo = MIRBuilder.buildConstant(NVT, 0);
4839       Hi = MIRBuilder.buildShl(NVT, InL,
4840                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4841     } else if (Amt == NVTBits) {
4842       Lo = MIRBuilder.buildConstant(NVT, 0);
4843       Hi = InL;
4844     } else {
4845       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4846       auto OrLHS =
4847           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4848       auto OrRHS = MIRBuilder.buildLShr(
4849           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4850       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4851     }
4852   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4853     if (Amt.ugt(VTBits)) {
4854       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4855     } else if (Amt.ugt(NVTBits)) {
4856       Lo = MIRBuilder.buildLShr(NVT, InH,
4857                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4858       Hi = MIRBuilder.buildConstant(NVT, 0);
4859     } else if (Amt == NVTBits) {
4860       Lo = InH;
4861       Hi = MIRBuilder.buildConstant(NVT, 0);
4862     } else {
4863       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4864 
4865       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4866       auto OrRHS = MIRBuilder.buildShl(
4867           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4868 
4869       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4870       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4871     }
4872   } else {
4873     if (Amt.ugt(VTBits)) {
4874       Hi = Lo = MIRBuilder.buildAShr(
4875           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4876     } else if (Amt.ugt(NVTBits)) {
4877       Lo = MIRBuilder.buildAShr(NVT, InH,
4878                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4879       Hi = MIRBuilder.buildAShr(NVT, InH,
4880                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4881     } else if (Amt == NVTBits) {
4882       Lo = InH;
4883       Hi = MIRBuilder.buildAShr(NVT, InH,
4884                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4885     } else {
4886       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4887 
4888       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4889       auto OrRHS = MIRBuilder.buildShl(
4890           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4891 
4892       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4893       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4894     }
4895   }
4896 
4897   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4898   MI.eraseFromParent();
4899 
4900   return Legalized;
4901 }
4902 
4903 // TODO: Optimize if constant shift amount.
4904 LegalizerHelper::LegalizeResult
4905 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4906                                    LLT RequestedTy) {
4907   if (TypeIdx == 1) {
4908     Observer.changingInstr(MI);
4909     narrowScalarSrc(MI, RequestedTy, 2);
4910     Observer.changedInstr(MI);
4911     return Legalized;
4912   }
4913 
4914   Register DstReg = MI.getOperand(0).getReg();
4915   LLT DstTy = MRI.getType(DstReg);
4916   if (DstTy.isVector())
4917     return UnableToLegalize;
4918 
4919   Register Amt = MI.getOperand(2).getReg();
4920   LLT ShiftAmtTy = MRI.getType(Amt);
4921   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4922   if (DstEltSize % 2 != 0)
4923     return UnableToLegalize;
4924 
4925   // Ignore the input type. We can only go to exactly half the size of the
4926   // input. If that isn't small enough, the resulting pieces will be further
4927   // legalized.
4928   const unsigned NewBitSize = DstEltSize / 2;
4929   const LLT HalfTy = LLT::scalar(NewBitSize);
4930   const LLT CondTy = LLT::scalar(1);
4931 
4932   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4933     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4934                                        ShiftAmtTy);
4935   }
4936 
4937   // TODO: Expand with known bits.
4938 
4939   // Handle the fully general expansion by an unknown amount.
4940   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4941 
4942   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4943   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4944   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4945 
4946   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4947   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4948 
4949   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4950   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4951   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4952 
4953   Register ResultRegs[2];
4954   switch (MI.getOpcode()) {
4955   case TargetOpcode::G_SHL: {
4956     // Short: ShAmt < NewBitSize
4957     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4958 
4959     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4960     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4961     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4962 
4963     // Long: ShAmt >= NewBitSize
4964     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4965     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4966 
4967     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4968     auto Hi = MIRBuilder.buildSelect(
4969         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4970 
4971     ResultRegs[0] = Lo.getReg(0);
4972     ResultRegs[1] = Hi.getReg(0);
4973     break;
4974   }
4975   case TargetOpcode::G_LSHR:
4976   case TargetOpcode::G_ASHR: {
4977     // Short: ShAmt < NewBitSize
4978     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4979 
4980     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4981     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4982     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4983 
4984     // Long: ShAmt >= NewBitSize
4985     MachineInstrBuilder HiL;
4986     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4987       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4988     } else {
4989       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4990       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4991     }
4992     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4993                                      {InH, AmtExcess});     // Lo from Hi part.
4994 
4995     auto Lo = MIRBuilder.buildSelect(
4996         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4997 
4998     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4999 
5000     ResultRegs[0] = Lo.getReg(0);
5001     ResultRegs[1] = Hi.getReg(0);
5002     break;
5003   }
5004   default:
5005     llvm_unreachable("not a shift");
5006   }
5007 
5008   MIRBuilder.buildMerge(DstReg, ResultRegs);
5009   MI.eraseFromParent();
5010   return Legalized;
5011 }
5012 
5013 LegalizerHelper::LegalizeResult
5014 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5015                                        LLT MoreTy) {
5016   assert(TypeIdx == 0 && "Expecting only Idx 0");
5017 
5018   Observer.changingInstr(MI);
5019   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5020     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5021     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5022     moreElementsVectorSrc(MI, MoreTy, I);
5023   }
5024 
5025   MachineBasicBlock &MBB = *MI.getParent();
5026   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5027   moreElementsVectorDst(MI, MoreTy, 0);
5028   Observer.changedInstr(MI);
5029   return Legalized;
5030 }
5031 
5032 LegalizerHelper::LegalizeResult
5033 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5034                                     LLT MoreTy) {
5035   unsigned Opc = MI.getOpcode();
5036   switch (Opc) {
5037   case TargetOpcode::G_IMPLICIT_DEF:
5038   case TargetOpcode::G_LOAD: {
5039     if (TypeIdx != 0)
5040       return UnableToLegalize;
5041     Observer.changingInstr(MI);
5042     moreElementsVectorDst(MI, MoreTy, 0);
5043     Observer.changedInstr(MI);
5044     return Legalized;
5045   }
5046   case TargetOpcode::G_STORE:
5047     if (TypeIdx != 0)
5048       return UnableToLegalize;
5049     Observer.changingInstr(MI);
5050     moreElementsVectorSrc(MI, MoreTy, 0);
5051     Observer.changedInstr(MI);
5052     return Legalized;
5053   case TargetOpcode::G_AND:
5054   case TargetOpcode::G_OR:
5055   case TargetOpcode::G_XOR:
5056   case TargetOpcode::G_SMIN:
5057   case TargetOpcode::G_SMAX:
5058   case TargetOpcode::G_UMIN:
5059   case TargetOpcode::G_UMAX:
5060   case TargetOpcode::G_FMINNUM:
5061   case TargetOpcode::G_FMAXNUM:
5062   case TargetOpcode::G_FMINNUM_IEEE:
5063   case TargetOpcode::G_FMAXNUM_IEEE:
5064   case TargetOpcode::G_FMINIMUM:
5065   case TargetOpcode::G_FMAXIMUM: {
5066     Observer.changingInstr(MI);
5067     moreElementsVectorSrc(MI, MoreTy, 1);
5068     moreElementsVectorSrc(MI, MoreTy, 2);
5069     moreElementsVectorDst(MI, MoreTy, 0);
5070     Observer.changedInstr(MI);
5071     return Legalized;
5072   }
5073   case TargetOpcode::G_EXTRACT:
5074     if (TypeIdx != 1)
5075       return UnableToLegalize;
5076     Observer.changingInstr(MI);
5077     moreElementsVectorSrc(MI, MoreTy, 1);
5078     Observer.changedInstr(MI);
5079     return Legalized;
5080   case TargetOpcode::G_INSERT:
5081   case TargetOpcode::G_FREEZE:
5082     if (TypeIdx != 0)
5083       return UnableToLegalize;
5084     Observer.changingInstr(MI);
5085     moreElementsVectorSrc(MI, MoreTy, 1);
5086     moreElementsVectorDst(MI, MoreTy, 0);
5087     Observer.changedInstr(MI);
5088     return Legalized;
5089   case TargetOpcode::G_SELECT:
5090     if (TypeIdx != 0)
5091       return UnableToLegalize;
5092     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
5093       return UnableToLegalize;
5094 
5095     Observer.changingInstr(MI);
5096     moreElementsVectorSrc(MI, MoreTy, 2);
5097     moreElementsVectorSrc(MI, MoreTy, 3);
5098     moreElementsVectorDst(MI, MoreTy, 0);
5099     Observer.changedInstr(MI);
5100     return Legalized;
5101   case TargetOpcode::G_UNMERGE_VALUES: {
5102     if (TypeIdx != 1)
5103       return UnableToLegalize;
5104 
5105     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
5106     int NumDst = MI.getNumOperands() - 1;
5107     moreElementsVectorSrc(MI, MoreTy, NumDst);
5108 
5109     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5110     for (int I = 0; I != NumDst; ++I)
5111       MIB.addDef(MI.getOperand(I).getReg());
5112 
5113     int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
5114     for (int I = NumDst; I != NewNumDst; ++I)
5115       MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
5116 
5117     MIB.addUse(MI.getOperand(NumDst).getReg());
5118     MI.eraseFromParent();
5119     return Legalized;
5120   }
5121   case TargetOpcode::G_PHI:
5122     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5123   case TargetOpcode::G_SHUFFLE_VECTOR:
5124     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5125   default:
5126     return UnableToLegalize;
5127   }
5128 }
5129 
5130 LegalizerHelper::LegalizeResult
5131 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5132                                            unsigned int TypeIdx, LLT MoreTy) {
5133   if (TypeIdx != 0)
5134     return UnableToLegalize;
5135 
5136   Register DstReg = MI.getOperand(0).getReg();
5137   Register Src1Reg = MI.getOperand(1).getReg();
5138   Register Src2Reg = MI.getOperand(2).getReg();
5139   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5140   LLT DstTy = MRI.getType(DstReg);
5141   LLT Src1Ty = MRI.getType(Src1Reg);
5142   LLT Src2Ty = MRI.getType(Src2Reg);
5143   unsigned NumElts = DstTy.getNumElements();
5144   unsigned WidenNumElts = MoreTy.getNumElements();
5145 
5146   // Expect a canonicalized shuffle.
5147   if (DstTy != Src1Ty || DstTy != Src2Ty)
5148     return UnableToLegalize;
5149 
5150   moreElementsVectorSrc(MI, MoreTy, 1);
5151   moreElementsVectorSrc(MI, MoreTy, 2);
5152 
5153   // Adjust mask based on new input vector length.
5154   SmallVector<int, 16> NewMask;
5155   for (unsigned I = 0; I != NumElts; ++I) {
5156     int Idx = Mask[I];
5157     if (Idx < static_cast<int>(NumElts))
5158       NewMask.push_back(Idx);
5159     else
5160       NewMask.push_back(Idx - NumElts + WidenNumElts);
5161   }
5162   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5163     NewMask.push_back(-1);
5164   moreElementsVectorDst(MI, MoreTy, 0);
5165   MIRBuilder.setInstrAndDebugLoc(MI);
5166   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5167                                 MI.getOperand(1).getReg(),
5168                                 MI.getOperand(2).getReg(), NewMask);
5169   MI.eraseFromParent();
5170   return Legalized;
5171 }
5172 
5173 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5174                                         ArrayRef<Register> Src1Regs,
5175                                         ArrayRef<Register> Src2Regs,
5176                                         LLT NarrowTy) {
5177   MachineIRBuilder &B = MIRBuilder;
5178   unsigned SrcParts = Src1Regs.size();
5179   unsigned DstParts = DstRegs.size();
5180 
5181   unsigned DstIdx = 0; // Low bits of the result.
5182   Register FactorSum =
5183       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5184   DstRegs[DstIdx] = FactorSum;
5185 
5186   unsigned CarrySumPrevDstIdx;
5187   SmallVector<Register, 4> Factors;
5188 
5189   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5190     // Collect low parts of muls for DstIdx.
5191     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5192          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5193       MachineInstrBuilder Mul =
5194           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5195       Factors.push_back(Mul.getReg(0));
5196     }
5197     // Collect high parts of muls from previous DstIdx.
5198     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5199          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5200       MachineInstrBuilder Umulh =
5201           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5202       Factors.push_back(Umulh.getReg(0));
5203     }
5204     // Add CarrySum from additions calculated for previous DstIdx.
5205     if (DstIdx != 1) {
5206       Factors.push_back(CarrySumPrevDstIdx);
5207     }
5208 
5209     Register CarrySum;
5210     // Add all factors and accumulate all carries into CarrySum.
5211     if (DstIdx != DstParts - 1) {
5212       MachineInstrBuilder Uaddo =
5213           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5214       FactorSum = Uaddo.getReg(0);
5215       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5216       for (unsigned i = 2; i < Factors.size(); ++i) {
5217         MachineInstrBuilder Uaddo =
5218             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5219         FactorSum = Uaddo.getReg(0);
5220         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5221         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5222       }
5223     } else {
5224       // Since value for the next index is not calculated, neither is CarrySum.
5225       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5226       for (unsigned i = 2; i < Factors.size(); ++i)
5227         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5228     }
5229 
5230     CarrySumPrevDstIdx = CarrySum;
5231     DstRegs[DstIdx] = FactorSum;
5232     Factors.clear();
5233   }
5234 }
5235 
5236 LegalizerHelper::LegalizeResult
5237 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5238                                     LLT NarrowTy) {
5239   if (TypeIdx != 0)
5240     return UnableToLegalize;
5241 
5242   Register DstReg = MI.getOperand(0).getReg();
5243   LLT DstType = MRI.getType(DstReg);
5244   // FIXME: add support for vector types
5245   if (DstType.isVector())
5246     return UnableToLegalize;
5247 
5248   unsigned Opcode = MI.getOpcode();
5249   unsigned OpO, OpE, OpF;
5250   switch (Opcode) {
5251   case TargetOpcode::G_SADDO:
5252   case TargetOpcode::G_SADDE:
5253   case TargetOpcode::G_UADDO:
5254   case TargetOpcode::G_UADDE:
5255   case TargetOpcode::G_ADD:
5256     OpO = TargetOpcode::G_UADDO;
5257     OpE = TargetOpcode::G_UADDE;
5258     OpF = TargetOpcode::G_UADDE;
5259     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5260       OpF = TargetOpcode::G_SADDE;
5261     break;
5262   case TargetOpcode::G_SSUBO:
5263   case TargetOpcode::G_SSUBE:
5264   case TargetOpcode::G_USUBO:
5265   case TargetOpcode::G_USUBE:
5266   case TargetOpcode::G_SUB:
5267     OpO = TargetOpcode::G_USUBO;
5268     OpE = TargetOpcode::G_USUBE;
5269     OpF = TargetOpcode::G_USUBE;
5270     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5271       OpF = TargetOpcode::G_SSUBE;
5272     break;
5273   default:
5274     llvm_unreachable("Unexpected add/sub opcode!");
5275   }
5276 
5277   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5278   unsigned NumDefs = MI.getNumExplicitDefs();
5279   Register Src1 = MI.getOperand(NumDefs).getReg();
5280   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5281   Register CarryDst, CarryIn;
5282   if (NumDefs == 2)
5283     CarryDst = MI.getOperand(1).getReg();
5284   if (MI.getNumOperands() == NumDefs + 3)
5285     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5286 
5287   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5288   LLT LeftoverTy, DummyTy;
5289   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5290   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5291   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5292 
5293   int NarrowParts = Src1Regs.size();
5294   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5295     Src1Regs.push_back(Src1Left[I]);
5296     Src2Regs.push_back(Src2Left[I]);
5297   }
5298   DstRegs.reserve(Src1Regs.size());
5299 
5300   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5301     Register DstReg =
5302         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5303     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5304     // Forward the final carry-out to the destination register
5305     if (i == e - 1 && CarryDst)
5306       CarryOut = CarryDst;
5307 
5308     if (!CarryIn) {
5309       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5310                             {Src1Regs[i], Src2Regs[i]});
5311     } else if (i == e - 1) {
5312       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5313                             {Src1Regs[i], Src2Regs[i], CarryIn});
5314     } else {
5315       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5316                             {Src1Regs[i], Src2Regs[i], CarryIn});
5317     }
5318 
5319     DstRegs.push_back(DstReg);
5320     CarryIn = CarryOut;
5321   }
5322   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5323               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5324               makeArrayRef(DstRegs).drop_front(NarrowParts));
5325 
5326   MI.eraseFromParent();
5327   return Legalized;
5328 }
5329 
5330 LegalizerHelper::LegalizeResult
5331 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5332   Register DstReg = MI.getOperand(0).getReg();
5333   Register Src1 = MI.getOperand(1).getReg();
5334   Register Src2 = MI.getOperand(2).getReg();
5335 
5336   LLT Ty = MRI.getType(DstReg);
5337   if (Ty.isVector())
5338     return UnableToLegalize;
5339 
5340   unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
5341   unsigned DstSize = Ty.getSizeInBits();
5342   unsigned NarrowSize = NarrowTy.getSizeInBits();
5343   if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
5344     return UnableToLegalize;
5345 
5346   unsigned NumDstParts = DstSize / NarrowSize;
5347   unsigned NumSrcParts = SrcSize / NarrowSize;
5348   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5349   unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
5350 
5351   SmallVector<Register, 2> Src1Parts, Src2Parts;
5352   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5353   extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
5354   extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
5355   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5356 
5357   // Take only high half of registers if this is high mul.
5358   ArrayRef<Register> DstRegs(
5359       IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
5360   MIRBuilder.buildMerge(DstReg, DstRegs);
5361   MI.eraseFromParent();
5362   return Legalized;
5363 }
5364 
5365 LegalizerHelper::LegalizeResult
5366 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5367                                    LLT NarrowTy) {
5368   if (TypeIdx != 0)
5369     return UnableToLegalize;
5370 
5371   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5372 
5373   Register Src = MI.getOperand(1).getReg();
5374   LLT SrcTy = MRI.getType(Src);
5375 
5376   // If all finite floats fit into the narrowed integer type, we can just swap
5377   // out the result type. This is practically only useful for conversions from
5378   // half to at least 16-bits, so just handle the one case.
5379   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5380       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5381     return UnableToLegalize;
5382 
5383   Observer.changingInstr(MI);
5384   narrowScalarDst(MI, NarrowTy, 0,
5385                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5386   Observer.changedInstr(MI);
5387   return Legalized;
5388 }
5389 
5390 LegalizerHelper::LegalizeResult
5391 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5392                                      LLT NarrowTy) {
5393   if (TypeIdx != 1)
5394     return UnableToLegalize;
5395 
5396   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5397 
5398   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5399   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5400   // NarrowSize.
5401   if (SizeOp1 % NarrowSize != 0)
5402     return UnableToLegalize;
5403   int NumParts = SizeOp1 / NarrowSize;
5404 
5405   SmallVector<Register, 2> SrcRegs, DstRegs;
5406   SmallVector<uint64_t, 2> Indexes;
5407   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5408 
5409   Register OpReg = MI.getOperand(0).getReg();
5410   uint64_t OpStart = MI.getOperand(2).getImm();
5411   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5412   for (int i = 0; i < NumParts; ++i) {
5413     unsigned SrcStart = i * NarrowSize;
5414 
5415     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5416       // No part of the extract uses this subregister, ignore it.
5417       continue;
5418     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5419       // The entire subregister is extracted, forward the value.
5420       DstRegs.push_back(SrcRegs[i]);
5421       continue;
5422     }
5423 
5424     // OpSegStart is where this destination segment would start in OpReg if it
5425     // extended infinitely in both directions.
5426     int64_t ExtractOffset;
5427     uint64_t SegSize;
5428     if (OpStart < SrcStart) {
5429       ExtractOffset = 0;
5430       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5431     } else {
5432       ExtractOffset = OpStart - SrcStart;
5433       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5434     }
5435 
5436     Register SegReg = SrcRegs[i];
5437     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5438       // A genuine extract is needed.
5439       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5440       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5441     }
5442 
5443     DstRegs.push_back(SegReg);
5444   }
5445 
5446   Register DstReg = MI.getOperand(0).getReg();
5447   if (MRI.getType(DstReg).isVector())
5448     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5449   else if (DstRegs.size() > 1)
5450     MIRBuilder.buildMerge(DstReg, DstRegs);
5451   else
5452     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5453   MI.eraseFromParent();
5454   return Legalized;
5455 }
5456 
5457 LegalizerHelper::LegalizeResult
5458 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5459                                     LLT NarrowTy) {
5460   // FIXME: Don't know how to handle secondary types yet.
5461   if (TypeIdx != 0)
5462     return UnableToLegalize;
5463 
5464   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5465   SmallVector<uint64_t, 2> Indexes;
5466   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5467   LLT LeftoverTy;
5468   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5469                LeftoverRegs);
5470 
5471   for (Register Reg : LeftoverRegs)
5472     SrcRegs.push_back(Reg);
5473 
5474   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5475   Register OpReg = MI.getOperand(2).getReg();
5476   uint64_t OpStart = MI.getOperand(3).getImm();
5477   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5478   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5479     unsigned DstStart = I * NarrowSize;
5480 
5481     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5482       // The entire subregister is defined by this insert, forward the new
5483       // value.
5484       DstRegs.push_back(OpReg);
5485       continue;
5486     }
5487 
5488     Register SrcReg = SrcRegs[I];
5489     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5490       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5491       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5492       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5493     }
5494 
5495     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5496       // No part of the insert affects this subregister, forward the original.
5497       DstRegs.push_back(SrcReg);
5498       continue;
5499     }
5500 
5501     // OpSegStart is where this destination segment would start in OpReg if it
5502     // extended infinitely in both directions.
5503     int64_t ExtractOffset, InsertOffset;
5504     uint64_t SegSize;
5505     if (OpStart < DstStart) {
5506       InsertOffset = 0;
5507       ExtractOffset = DstStart - OpStart;
5508       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5509     } else {
5510       InsertOffset = OpStart - DstStart;
5511       ExtractOffset = 0;
5512       SegSize =
5513         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5514     }
5515 
5516     Register SegReg = OpReg;
5517     if (ExtractOffset != 0 || SegSize != OpSize) {
5518       // A genuine extract is needed.
5519       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5520       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5521     }
5522 
5523     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5524     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5525     DstRegs.push_back(DstReg);
5526   }
5527 
5528   uint64_t WideSize = DstRegs.size() * NarrowSize;
5529   Register DstReg = MI.getOperand(0).getReg();
5530   if (WideSize > RegTy.getSizeInBits()) {
5531     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5532     MIRBuilder.buildMerge(MergeReg, DstRegs);
5533     MIRBuilder.buildTrunc(DstReg, MergeReg);
5534   } else
5535     MIRBuilder.buildMerge(DstReg, DstRegs);
5536 
5537   MI.eraseFromParent();
5538   return Legalized;
5539 }
5540 
5541 LegalizerHelper::LegalizeResult
5542 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5543                                    LLT NarrowTy) {
5544   Register DstReg = MI.getOperand(0).getReg();
5545   LLT DstTy = MRI.getType(DstReg);
5546 
5547   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5548 
5549   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5550   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5551   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5552   LLT LeftoverTy;
5553   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5554                     Src0Regs, Src0LeftoverRegs))
5555     return UnableToLegalize;
5556 
5557   LLT Unused;
5558   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5559                     Src1Regs, Src1LeftoverRegs))
5560     llvm_unreachable("inconsistent extractParts result");
5561 
5562   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5563     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5564                                         {Src0Regs[I], Src1Regs[I]});
5565     DstRegs.push_back(Inst.getReg(0));
5566   }
5567 
5568   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5569     auto Inst = MIRBuilder.buildInstr(
5570       MI.getOpcode(),
5571       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5572     DstLeftoverRegs.push_back(Inst.getReg(0));
5573   }
5574 
5575   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5576               LeftoverTy, DstLeftoverRegs);
5577 
5578   MI.eraseFromParent();
5579   return Legalized;
5580 }
5581 
5582 LegalizerHelper::LegalizeResult
5583 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5584                                  LLT NarrowTy) {
5585   if (TypeIdx != 0)
5586     return UnableToLegalize;
5587 
5588   Register DstReg = MI.getOperand(0).getReg();
5589   Register SrcReg = MI.getOperand(1).getReg();
5590 
5591   LLT DstTy = MRI.getType(DstReg);
5592   if (DstTy.isVector())
5593     return UnableToLegalize;
5594 
5595   SmallVector<Register, 8> Parts;
5596   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5597   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5598   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5599 
5600   MI.eraseFromParent();
5601   return Legalized;
5602 }
5603 
5604 LegalizerHelper::LegalizeResult
5605 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5606                                     LLT NarrowTy) {
5607   if (TypeIdx != 0)
5608     return UnableToLegalize;
5609 
5610   Register CondReg = MI.getOperand(1).getReg();
5611   LLT CondTy = MRI.getType(CondReg);
5612   if (CondTy.isVector()) // TODO: Handle vselect
5613     return UnableToLegalize;
5614 
5615   Register DstReg = MI.getOperand(0).getReg();
5616   LLT DstTy = MRI.getType(DstReg);
5617 
5618   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5619   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5620   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5621   LLT LeftoverTy;
5622   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5623                     Src1Regs, Src1LeftoverRegs))
5624     return UnableToLegalize;
5625 
5626   LLT Unused;
5627   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5628                     Src2Regs, Src2LeftoverRegs))
5629     llvm_unreachable("inconsistent extractParts result");
5630 
5631   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5632     auto Select = MIRBuilder.buildSelect(NarrowTy,
5633                                          CondReg, Src1Regs[I], Src2Regs[I]);
5634     DstRegs.push_back(Select.getReg(0));
5635   }
5636 
5637   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5638     auto Select = MIRBuilder.buildSelect(
5639       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5640     DstLeftoverRegs.push_back(Select.getReg(0));
5641   }
5642 
5643   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5644               LeftoverTy, DstLeftoverRegs);
5645 
5646   MI.eraseFromParent();
5647   return Legalized;
5648 }
5649 
5650 LegalizerHelper::LegalizeResult
5651 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5652                                   LLT NarrowTy) {
5653   if (TypeIdx != 1)
5654     return UnableToLegalize;
5655 
5656   Register DstReg = MI.getOperand(0).getReg();
5657   Register SrcReg = MI.getOperand(1).getReg();
5658   LLT DstTy = MRI.getType(DstReg);
5659   LLT SrcTy = MRI.getType(SrcReg);
5660   unsigned NarrowSize = NarrowTy.getSizeInBits();
5661 
5662   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5663     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5664 
5665     MachineIRBuilder &B = MIRBuilder;
5666     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5667     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5668     auto C_0 = B.buildConstant(NarrowTy, 0);
5669     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5670                                 UnmergeSrc.getReg(1), C_0);
5671     auto LoCTLZ = IsUndef ?
5672       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5673       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5674     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5675     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5676     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5677     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5678 
5679     MI.eraseFromParent();
5680     return Legalized;
5681   }
5682 
5683   return UnableToLegalize;
5684 }
5685 
5686 LegalizerHelper::LegalizeResult
5687 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5688                                   LLT NarrowTy) {
5689   if (TypeIdx != 1)
5690     return UnableToLegalize;
5691 
5692   Register DstReg = MI.getOperand(0).getReg();
5693   Register SrcReg = MI.getOperand(1).getReg();
5694   LLT DstTy = MRI.getType(DstReg);
5695   LLT SrcTy = MRI.getType(SrcReg);
5696   unsigned NarrowSize = NarrowTy.getSizeInBits();
5697 
5698   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5699     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5700 
5701     MachineIRBuilder &B = MIRBuilder;
5702     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5703     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5704     auto C_0 = B.buildConstant(NarrowTy, 0);
5705     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5706                                 UnmergeSrc.getReg(0), C_0);
5707     auto HiCTTZ = IsUndef ?
5708       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5709       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5710     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5711     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5712     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5713     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5714 
5715     MI.eraseFromParent();
5716     return Legalized;
5717   }
5718 
5719   return UnableToLegalize;
5720 }
5721 
5722 LegalizerHelper::LegalizeResult
5723 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5724                                    LLT NarrowTy) {
5725   if (TypeIdx != 1)
5726     return UnableToLegalize;
5727 
5728   Register DstReg = MI.getOperand(0).getReg();
5729   LLT DstTy = MRI.getType(DstReg);
5730   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5731   unsigned NarrowSize = NarrowTy.getSizeInBits();
5732 
5733   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5734     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5735 
5736     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5737     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5738     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5739 
5740     MI.eraseFromParent();
5741     return Legalized;
5742   }
5743 
5744   return UnableToLegalize;
5745 }
5746 
5747 LegalizerHelper::LegalizeResult
5748 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5749   unsigned Opc = MI.getOpcode();
5750   const auto &TII = MIRBuilder.getTII();
5751   auto isSupported = [this](const LegalityQuery &Q) {
5752     auto QAction = LI.getAction(Q).Action;
5753     return QAction == Legal || QAction == Libcall || QAction == Custom;
5754   };
5755   switch (Opc) {
5756   default:
5757     return UnableToLegalize;
5758   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5759     // This trivially expands to CTLZ.
5760     Observer.changingInstr(MI);
5761     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5762     Observer.changedInstr(MI);
5763     return Legalized;
5764   }
5765   case TargetOpcode::G_CTLZ: {
5766     Register DstReg = MI.getOperand(0).getReg();
5767     Register SrcReg = MI.getOperand(1).getReg();
5768     LLT DstTy = MRI.getType(DstReg);
5769     LLT SrcTy = MRI.getType(SrcReg);
5770     unsigned Len = SrcTy.getSizeInBits();
5771 
5772     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5773       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5774       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5775       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5776       auto ICmp = MIRBuilder.buildICmp(
5777           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5778       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5779       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5780       MI.eraseFromParent();
5781       return Legalized;
5782     }
5783     // for now, we do this:
5784     // NewLen = NextPowerOf2(Len);
5785     // x = x | (x >> 1);
5786     // x = x | (x >> 2);
5787     // ...
5788     // x = x | (x >>16);
5789     // x = x | (x >>32); // for 64-bit input
5790     // Upto NewLen/2
5791     // return Len - popcount(x);
5792     //
5793     // Ref: "Hacker's Delight" by Henry Warren
5794     Register Op = SrcReg;
5795     unsigned NewLen = PowerOf2Ceil(Len);
5796     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5797       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5798       auto MIBOp = MIRBuilder.buildOr(
5799           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5800       Op = MIBOp.getReg(0);
5801     }
5802     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5803     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5804                         MIBPop);
5805     MI.eraseFromParent();
5806     return Legalized;
5807   }
5808   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5809     // This trivially expands to CTTZ.
5810     Observer.changingInstr(MI);
5811     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5812     Observer.changedInstr(MI);
5813     return Legalized;
5814   }
5815   case TargetOpcode::G_CTTZ: {
5816     Register DstReg = MI.getOperand(0).getReg();
5817     Register SrcReg = MI.getOperand(1).getReg();
5818     LLT DstTy = MRI.getType(DstReg);
5819     LLT SrcTy = MRI.getType(SrcReg);
5820 
5821     unsigned Len = SrcTy.getSizeInBits();
5822     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5823       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5824       // zero.
5825       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5826       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5827       auto ICmp = MIRBuilder.buildICmp(
5828           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5829       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5830       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5831       MI.eraseFromParent();
5832       return Legalized;
5833     }
5834     // for now, we use: { return popcount(~x & (x - 1)); }
5835     // unless the target has ctlz but not ctpop, in which case we use:
5836     // { return 32 - nlz(~x & (x-1)); }
5837     // Ref: "Hacker's Delight" by Henry Warren
5838     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5839     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5840     auto MIBTmp = MIRBuilder.buildAnd(
5841         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5842     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5843         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5844       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5845       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5846                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5847       MI.eraseFromParent();
5848       return Legalized;
5849     }
5850     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5851     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5852     return Legalized;
5853   }
5854   case TargetOpcode::G_CTPOP: {
5855     Register SrcReg = MI.getOperand(1).getReg();
5856     LLT Ty = MRI.getType(SrcReg);
5857     unsigned Size = Ty.getSizeInBits();
5858     MachineIRBuilder &B = MIRBuilder;
5859 
5860     // Count set bits in blocks of 2 bits. Default approach would be
5861     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5862     // We use following formula instead:
5863     // B2Count = val - { (val >> 1) & 0x55555555 }
5864     // since it gives same result in blocks of 2 with one instruction less.
5865     auto C_1 = B.buildConstant(Ty, 1);
5866     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5867     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5868     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5869     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5870     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5871 
5872     // In order to get count in blocks of 4 add values from adjacent block of 2.
5873     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5874     auto C_2 = B.buildConstant(Ty, 2);
5875     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5876     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5877     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5878     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5879     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5880     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5881 
5882     // For count in blocks of 8 bits we don't have to mask high 4 bits before
5883     // addition since count value sits in range {0,...,8} and 4 bits are enough
5884     // to hold such binary values. After addition high 4 bits still hold count
5885     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
5886     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5887     auto C_4 = B.buildConstant(Ty, 4);
5888     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5889     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5890     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5891     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5892     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5893 
5894     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
5895     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
5896     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
5897     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5898     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5899 
5900     // Shift count result from 8 high bits to low bits.
5901     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5902     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5903 
5904     MI.eraseFromParent();
5905     return Legalized;
5906   }
5907   }
5908 }
5909 
5910 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5911 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5912                                         Register Reg, unsigned BW) {
5913   return matchUnaryPredicate(
5914       MRI, Reg,
5915       [=](const Constant *C) {
5916         // Null constant here means an undef.
5917         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5918         return !CI || CI->getValue().urem(BW) != 0;
5919       },
5920       /*AllowUndefs*/ true);
5921 }
5922 
5923 LegalizerHelper::LegalizeResult
5924 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5925   Register Dst = MI.getOperand(0).getReg();
5926   Register X = MI.getOperand(1).getReg();
5927   Register Y = MI.getOperand(2).getReg();
5928   Register Z = MI.getOperand(3).getReg();
5929   LLT Ty = MRI.getType(Dst);
5930   LLT ShTy = MRI.getType(Z);
5931 
5932   unsigned BW = Ty.getScalarSizeInBits();
5933 
5934   if (!isPowerOf2_32(BW))
5935     return UnableToLegalize;
5936 
5937   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5938   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5939 
5940   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5941     // fshl X, Y, Z -> fshr X, Y, -Z
5942     // fshr X, Y, Z -> fshl X, Y, -Z
5943     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5944     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5945   } else {
5946     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5947     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5948     auto One = MIRBuilder.buildConstant(ShTy, 1);
5949     if (IsFSHL) {
5950       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5951       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5952     } else {
5953       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5954       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5955     }
5956 
5957     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5958   }
5959 
5960   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5961   MI.eraseFromParent();
5962   return Legalized;
5963 }
5964 
5965 LegalizerHelper::LegalizeResult
5966 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5967   Register Dst = MI.getOperand(0).getReg();
5968   Register X = MI.getOperand(1).getReg();
5969   Register Y = MI.getOperand(2).getReg();
5970   Register Z = MI.getOperand(3).getReg();
5971   LLT Ty = MRI.getType(Dst);
5972   LLT ShTy = MRI.getType(Z);
5973 
5974   const unsigned BW = Ty.getScalarSizeInBits();
5975   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5976 
5977   Register ShX, ShY;
5978   Register ShAmt, InvShAmt;
5979 
5980   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5981   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5982     // fshl: X << C | Y >> (BW - C)
5983     // fshr: X << (BW - C) | Y >> C
5984     // where C = Z % BW is not zero
5985     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5986     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5987     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5988     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5989     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5990   } else {
5991     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5992     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5993     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5994     if (isPowerOf2_32(BW)) {
5995       // Z % BW -> Z & (BW - 1)
5996       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5997       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5998       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5999       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
6000     } else {
6001       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6002       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6003       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6004     }
6005 
6006     auto One = MIRBuilder.buildConstant(ShTy, 1);
6007     if (IsFSHL) {
6008       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6009       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6010       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6011     } else {
6012       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6013       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6014       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6015     }
6016   }
6017 
6018   MIRBuilder.buildOr(Dst, ShX, ShY);
6019   MI.eraseFromParent();
6020   return Legalized;
6021 }
6022 
6023 LegalizerHelper::LegalizeResult
6024 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6025   // These operations approximately do the following (while avoiding undefined
6026   // shifts by BW):
6027   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6028   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6029   Register Dst = MI.getOperand(0).getReg();
6030   LLT Ty = MRI.getType(Dst);
6031   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6032 
6033   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6034   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6035 
6036   // TODO: Use smarter heuristic that accounts for vector legalization.
6037   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6038     return lowerFunnelShiftAsShifts(MI);
6039 
6040   // This only works for powers of 2, fallback to shifts if it fails.
6041   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6042   if (Result == UnableToLegalize)
6043     return lowerFunnelShiftAsShifts(MI);
6044   return Result;
6045 }
6046 
6047 LegalizerHelper::LegalizeResult
6048 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6049   Register Dst = MI.getOperand(0).getReg();
6050   Register Src = MI.getOperand(1).getReg();
6051   Register Amt = MI.getOperand(2).getReg();
6052   LLT AmtTy = MRI.getType(Amt);
6053   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6054   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6055   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6056   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6057   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6058   MI.eraseFromParent();
6059   return Legalized;
6060 }
6061 
6062 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6063   Register Dst = MI.getOperand(0).getReg();
6064   Register Src = MI.getOperand(1).getReg();
6065   Register Amt = MI.getOperand(2).getReg();
6066   LLT DstTy = MRI.getType(Dst);
6067   LLT SrcTy = MRI.getType(Dst);
6068   LLT AmtTy = MRI.getType(Amt);
6069 
6070   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6071   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6072 
6073   MIRBuilder.setInstrAndDebugLoc(MI);
6074 
6075   // If a rotate in the other direction is supported, use it.
6076   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6077   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6078       isPowerOf2_32(EltSizeInBits))
6079     return lowerRotateWithReverseRotate(MI);
6080 
6081   // If a funnel shift is supported, use it.
6082   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6083   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6084   bool IsFShLegal = false;
6085   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
6086       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
6087     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6088                                 Register R3) {
6089       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
6090       MI.eraseFromParent();
6091       return Legalized;
6092     };
6093     // If a funnel shift in the other direction is supported, use it.
6094     if (IsFShLegal) {
6095       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6096     } else if (isPowerOf2_32(EltSizeInBits)) {
6097       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
6098       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6099     }
6100   }
6101 
6102   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6103   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6104   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6105   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6106   Register ShVal;
6107   Register RevShiftVal;
6108   if (isPowerOf2_32(EltSizeInBits)) {
6109     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6110     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6111     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6112     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6113     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6114     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6115     RevShiftVal =
6116         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6117   } else {
6118     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6119     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6120     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6121     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6122     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6123     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6124     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6125     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6126     RevShiftVal =
6127         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6128   }
6129   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6130   MI.eraseFromParent();
6131   return Legalized;
6132 }
6133 
6134 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6135 // representation.
6136 LegalizerHelper::LegalizeResult
6137 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6138   Register Dst = MI.getOperand(0).getReg();
6139   Register Src = MI.getOperand(1).getReg();
6140   const LLT S64 = LLT::scalar(64);
6141   const LLT S32 = LLT::scalar(32);
6142   const LLT S1 = LLT::scalar(1);
6143 
6144   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6145 
6146   // unsigned cul2f(ulong u) {
6147   //   uint lz = clz(u);
6148   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6149   //   u = (u << lz) & 0x7fffffffffffffffUL;
6150   //   ulong t = u & 0xffffffffffUL;
6151   //   uint v = (e << 23) | (uint)(u >> 40);
6152   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6153   //   return as_float(v + r);
6154   // }
6155 
6156   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6157   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6158 
6159   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6160 
6161   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6162   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6163 
6164   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6165   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6166 
6167   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6168   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6169 
6170   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6171 
6172   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6173   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6174 
6175   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6176   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6177   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6178 
6179   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6180   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6181   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6182   auto One = MIRBuilder.buildConstant(S32, 1);
6183 
6184   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6185   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6186   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6187   MIRBuilder.buildAdd(Dst, V, R);
6188 
6189   MI.eraseFromParent();
6190   return Legalized;
6191 }
6192 
6193 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6194   Register Dst = MI.getOperand(0).getReg();
6195   Register Src = MI.getOperand(1).getReg();
6196   LLT DstTy = MRI.getType(Dst);
6197   LLT SrcTy = MRI.getType(Src);
6198 
6199   if (SrcTy == LLT::scalar(1)) {
6200     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6201     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6202     MIRBuilder.buildSelect(Dst, Src, True, False);
6203     MI.eraseFromParent();
6204     return Legalized;
6205   }
6206 
6207   if (SrcTy != LLT::scalar(64))
6208     return UnableToLegalize;
6209 
6210   if (DstTy == LLT::scalar(32)) {
6211     // TODO: SelectionDAG has several alternative expansions to port which may
6212     // be more reasonble depending on the available instructions. If a target
6213     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6214     // intermediate type, this is probably worse.
6215     return lowerU64ToF32BitOps(MI);
6216   }
6217 
6218   return UnableToLegalize;
6219 }
6220 
6221 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6222   Register Dst = MI.getOperand(0).getReg();
6223   Register Src = MI.getOperand(1).getReg();
6224   LLT DstTy = MRI.getType(Dst);
6225   LLT SrcTy = MRI.getType(Src);
6226 
6227   const LLT S64 = LLT::scalar(64);
6228   const LLT S32 = LLT::scalar(32);
6229   const LLT S1 = LLT::scalar(1);
6230 
6231   if (SrcTy == S1) {
6232     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6233     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6234     MIRBuilder.buildSelect(Dst, Src, True, False);
6235     MI.eraseFromParent();
6236     return Legalized;
6237   }
6238 
6239   if (SrcTy != S64)
6240     return UnableToLegalize;
6241 
6242   if (DstTy == S32) {
6243     // signed cl2f(long l) {
6244     //   long s = l >> 63;
6245     //   float r = cul2f((l + s) ^ s);
6246     //   return s ? -r : r;
6247     // }
6248     Register L = Src;
6249     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6250     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6251 
6252     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6253     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6254     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6255 
6256     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6257     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6258                                             MIRBuilder.buildConstant(S64, 0));
6259     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6260     MI.eraseFromParent();
6261     return Legalized;
6262   }
6263 
6264   return UnableToLegalize;
6265 }
6266 
6267 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6268   Register Dst = MI.getOperand(0).getReg();
6269   Register Src = MI.getOperand(1).getReg();
6270   LLT DstTy = MRI.getType(Dst);
6271   LLT SrcTy = MRI.getType(Src);
6272   const LLT S64 = LLT::scalar(64);
6273   const LLT S32 = LLT::scalar(32);
6274 
6275   if (SrcTy != S64 && SrcTy != S32)
6276     return UnableToLegalize;
6277   if (DstTy != S32 && DstTy != S64)
6278     return UnableToLegalize;
6279 
6280   // FPTOSI gives same result as FPTOUI for positive signed integers.
6281   // FPTOUI needs to deal with fp values that convert to unsigned integers
6282   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6283 
6284   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6285   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6286                                                 : APFloat::IEEEdouble(),
6287                     APInt::getZero(SrcTy.getSizeInBits()));
6288   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6289 
6290   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6291 
6292   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6293   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6294   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6295   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6296   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6297   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6298   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6299 
6300   const LLT S1 = LLT::scalar(1);
6301 
6302   MachineInstrBuilder FCMP =
6303       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6304   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6305 
6306   MI.eraseFromParent();
6307   return Legalized;
6308 }
6309 
6310 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6311   Register Dst = MI.getOperand(0).getReg();
6312   Register Src = MI.getOperand(1).getReg();
6313   LLT DstTy = MRI.getType(Dst);
6314   LLT SrcTy = MRI.getType(Src);
6315   const LLT S64 = LLT::scalar(64);
6316   const LLT S32 = LLT::scalar(32);
6317 
6318   // FIXME: Only f32 to i64 conversions are supported.
6319   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6320     return UnableToLegalize;
6321 
6322   // Expand f32 -> i64 conversion
6323   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6324   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6325 
6326   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6327 
6328   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6329   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6330 
6331   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6332   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6333 
6334   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6335                                            APInt::getSignMask(SrcEltBits));
6336   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6337   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6338   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6339   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6340 
6341   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6342   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6343   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6344 
6345   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6346   R = MIRBuilder.buildZExt(DstTy, R);
6347 
6348   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6349   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6350   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6351   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6352 
6353   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6354   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6355 
6356   const LLT S1 = LLT::scalar(1);
6357   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6358                                     S1, Exponent, ExponentLoBit);
6359 
6360   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6361 
6362   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6363   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6364 
6365   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6366 
6367   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6368                                           S1, Exponent, ZeroSrcTy);
6369 
6370   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6371   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6372 
6373   MI.eraseFromParent();
6374   return Legalized;
6375 }
6376 
6377 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6378 LegalizerHelper::LegalizeResult
6379 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6380   Register Dst = MI.getOperand(0).getReg();
6381   Register Src = MI.getOperand(1).getReg();
6382 
6383   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6384     return UnableToLegalize;
6385 
6386   const unsigned ExpMask = 0x7ff;
6387   const unsigned ExpBiasf64 = 1023;
6388   const unsigned ExpBiasf16 = 15;
6389   const LLT S32 = LLT::scalar(32);
6390   const LLT S1 = LLT::scalar(1);
6391 
6392   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6393   Register U = Unmerge.getReg(0);
6394   Register UH = Unmerge.getReg(1);
6395 
6396   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6397   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6398 
6399   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6400   // add the f16 bias (15) to get the biased exponent for the f16 format.
6401   E = MIRBuilder.buildAdd(
6402     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6403 
6404   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6405   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6406 
6407   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6408                                        MIRBuilder.buildConstant(S32, 0x1ff));
6409   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6410 
6411   auto Zero = MIRBuilder.buildConstant(S32, 0);
6412   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6413   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6414   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6415 
6416   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6417   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6418   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6419   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6420 
6421   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6422   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6423 
6424   // N = M | (E << 12);
6425   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6426   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6427 
6428   // B = clamp(1-E, 0, 13);
6429   auto One = MIRBuilder.buildConstant(S32, 1);
6430   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6431   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6432   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6433 
6434   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6435                                        MIRBuilder.buildConstant(S32, 0x1000));
6436 
6437   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6438   auto D0 = MIRBuilder.buildShl(S32, D, B);
6439 
6440   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6441                                              D0, SigSetHigh);
6442   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6443   D = MIRBuilder.buildOr(S32, D, D1);
6444 
6445   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6446   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6447 
6448   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6449   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6450 
6451   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6452                                        MIRBuilder.buildConstant(S32, 3));
6453   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6454 
6455   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6456                                        MIRBuilder.buildConstant(S32, 5));
6457   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6458 
6459   V1 = MIRBuilder.buildOr(S32, V0, V1);
6460   V = MIRBuilder.buildAdd(S32, V, V1);
6461 
6462   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6463                                        E, MIRBuilder.buildConstant(S32, 30));
6464   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6465                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6466 
6467   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6468                                          E, MIRBuilder.buildConstant(S32, 1039));
6469   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6470 
6471   // Extract the sign bit.
6472   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6473   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6474 
6475   // Insert the sign bit
6476   V = MIRBuilder.buildOr(S32, Sign, V);
6477 
6478   MIRBuilder.buildTrunc(Dst, V);
6479   MI.eraseFromParent();
6480   return Legalized;
6481 }
6482 
6483 LegalizerHelper::LegalizeResult
6484 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6485   Register Dst = MI.getOperand(0).getReg();
6486   Register Src = MI.getOperand(1).getReg();
6487 
6488   LLT DstTy = MRI.getType(Dst);
6489   LLT SrcTy = MRI.getType(Src);
6490   const LLT S64 = LLT::scalar(64);
6491   const LLT S16 = LLT::scalar(16);
6492 
6493   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6494     return lowerFPTRUNC_F64_TO_F16(MI);
6495 
6496   return UnableToLegalize;
6497 }
6498 
6499 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6500 // multiplication tree.
6501 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6502   Register Dst = MI.getOperand(0).getReg();
6503   Register Src0 = MI.getOperand(1).getReg();
6504   Register Src1 = MI.getOperand(2).getReg();
6505   LLT Ty = MRI.getType(Dst);
6506 
6507   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6508   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6509   MI.eraseFromParent();
6510   return Legalized;
6511 }
6512 
6513 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6514   switch (Opc) {
6515   case TargetOpcode::G_SMIN:
6516     return CmpInst::ICMP_SLT;
6517   case TargetOpcode::G_SMAX:
6518     return CmpInst::ICMP_SGT;
6519   case TargetOpcode::G_UMIN:
6520     return CmpInst::ICMP_ULT;
6521   case TargetOpcode::G_UMAX:
6522     return CmpInst::ICMP_UGT;
6523   default:
6524     llvm_unreachable("not in integer min/max");
6525   }
6526 }
6527 
6528 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6529   Register Dst = MI.getOperand(0).getReg();
6530   Register Src0 = MI.getOperand(1).getReg();
6531   Register Src1 = MI.getOperand(2).getReg();
6532 
6533   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6534   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6535 
6536   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6537   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6538 
6539   MI.eraseFromParent();
6540   return Legalized;
6541 }
6542 
6543 LegalizerHelper::LegalizeResult
6544 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6545   Register Dst = MI.getOperand(0).getReg();
6546   Register Src0 = MI.getOperand(1).getReg();
6547   Register Src1 = MI.getOperand(2).getReg();
6548 
6549   const LLT Src0Ty = MRI.getType(Src0);
6550   const LLT Src1Ty = MRI.getType(Src1);
6551 
6552   const int Src0Size = Src0Ty.getScalarSizeInBits();
6553   const int Src1Size = Src1Ty.getScalarSizeInBits();
6554 
6555   auto SignBitMask = MIRBuilder.buildConstant(
6556     Src0Ty, APInt::getSignMask(Src0Size));
6557 
6558   auto NotSignBitMask = MIRBuilder.buildConstant(
6559     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6560 
6561   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6562   Register And1;
6563   if (Src0Ty == Src1Ty) {
6564     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6565   } else if (Src0Size > Src1Size) {
6566     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6567     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6568     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6569     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6570   } else {
6571     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6572     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6573     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6574     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6575   }
6576 
6577   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6578   // constants are a nan and -0.0, but the final result should preserve
6579   // everything.
6580   unsigned Flags = MI.getFlags();
6581   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6582 
6583   MI.eraseFromParent();
6584   return Legalized;
6585 }
6586 
6587 LegalizerHelper::LegalizeResult
6588 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6589   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6590     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6591 
6592   Register Dst = MI.getOperand(0).getReg();
6593   Register Src0 = MI.getOperand(1).getReg();
6594   Register Src1 = MI.getOperand(2).getReg();
6595   LLT Ty = MRI.getType(Dst);
6596 
6597   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6598     // Insert canonicalizes if it's possible we need to quiet to get correct
6599     // sNaN behavior.
6600 
6601     // Note this must be done here, and not as an optimization combine in the
6602     // absence of a dedicate quiet-snan instruction as we're using an
6603     // omni-purpose G_FCANONICALIZE.
6604     if (!isKnownNeverSNaN(Src0, MRI))
6605       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6606 
6607     if (!isKnownNeverSNaN(Src1, MRI))
6608       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6609   }
6610 
6611   // If there are no nans, it's safe to simply replace this with the non-IEEE
6612   // version.
6613   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6614   MI.eraseFromParent();
6615   return Legalized;
6616 }
6617 
6618 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6619   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6620   Register DstReg = MI.getOperand(0).getReg();
6621   LLT Ty = MRI.getType(DstReg);
6622   unsigned Flags = MI.getFlags();
6623 
6624   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6625                                   Flags);
6626   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6627   MI.eraseFromParent();
6628   return Legalized;
6629 }
6630 
6631 LegalizerHelper::LegalizeResult
6632 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6633   Register DstReg = MI.getOperand(0).getReg();
6634   Register X = MI.getOperand(1).getReg();
6635   const unsigned Flags = MI.getFlags();
6636   const LLT Ty = MRI.getType(DstReg);
6637   const LLT CondTy = Ty.changeElementSize(1);
6638 
6639   // round(x) =>
6640   //  t = trunc(x);
6641   //  d = fabs(x - t);
6642   //  o = copysign(1.0f, x);
6643   //  return t + (d >= 0.5 ? o : 0.0);
6644 
6645   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6646 
6647   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6648   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6649   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6650   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6651   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6652   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6653 
6654   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6655                                   Flags);
6656   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6657 
6658   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6659 
6660   MI.eraseFromParent();
6661   return Legalized;
6662 }
6663 
6664 LegalizerHelper::LegalizeResult
6665 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6666   Register DstReg = MI.getOperand(0).getReg();
6667   Register SrcReg = MI.getOperand(1).getReg();
6668   unsigned Flags = MI.getFlags();
6669   LLT Ty = MRI.getType(DstReg);
6670   const LLT CondTy = Ty.changeElementSize(1);
6671 
6672   // result = trunc(src);
6673   // if (src < 0.0 && src != result)
6674   //   result += -1.0.
6675 
6676   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6677   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6678 
6679   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6680                                   SrcReg, Zero, Flags);
6681   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6682                                       SrcReg, Trunc, Flags);
6683   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6684   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6685 
6686   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6687   MI.eraseFromParent();
6688   return Legalized;
6689 }
6690 
6691 LegalizerHelper::LegalizeResult
6692 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6693   const unsigned NumOps = MI.getNumOperands();
6694   Register DstReg = MI.getOperand(0).getReg();
6695   Register Src0Reg = MI.getOperand(1).getReg();
6696   LLT DstTy = MRI.getType(DstReg);
6697   LLT SrcTy = MRI.getType(Src0Reg);
6698   unsigned PartSize = SrcTy.getSizeInBits();
6699 
6700   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6701   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6702 
6703   for (unsigned I = 2; I != NumOps; ++I) {
6704     const unsigned Offset = (I - 1) * PartSize;
6705 
6706     Register SrcReg = MI.getOperand(I).getReg();
6707     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6708 
6709     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6710       MRI.createGenericVirtualRegister(WideTy);
6711 
6712     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6713     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6714     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6715     ResultReg = NextResult;
6716   }
6717 
6718   if (DstTy.isPointer()) {
6719     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6720           DstTy.getAddressSpace())) {
6721       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6722       return UnableToLegalize;
6723     }
6724 
6725     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6726   }
6727 
6728   MI.eraseFromParent();
6729   return Legalized;
6730 }
6731 
6732 LegalizerHelper::LegalizeResult
6733 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6734   const unsigned NumDst = MI.getNumOperands() - 1;
6735   Register SrcReg = MI.getOperand(NumDst).getReg();
6736   Register Dst0Reg = MI.getOperand(0).getReg();
6737   LLT DstTy = MRI.getType(Dst0Reg);
6738   if (DstTy.isPointer())
6739     return UnableToLegalize; // TODO
6740 
6741   SrcReg = coerceToScalar(SrcReg);
6742   if (!SrcReg)
6743     return UnableToLegalize;
6744 
6745   // Expand scalarizing unmerge as bitcast to integer and shift.
6746   LLT IntTy = MRI.getType(SrcReg);
6747 
6748   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6749 
6750   const unsigned DstSize = DstTy.getSizeInBits();
6751   unsigned Offset = DstSize;
6752   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6753     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6754     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6755     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6756   }
6757 
6758   MI.eraseFromParent();
6759   return Legalized;
6760 }
6761 
6762 /// Lower a vector extract or insert by writing the vector to a stack temporary
6763 /// and reloading the element or vector.
6764 ///
6765 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6766 ///  =>
6767 ///  %stack_temp = G_FRAME_INDEX
6768 ///  G_STORE %vec, %stack_temp
6769 ///  %idx = clamp(%idx, %vec.getNumElements())
6770 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6771 ///  %dst = G_LOAD %element_ptr
6772 LegalizerHelper::LegalizeResult
6773 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6774   Register DstReg = MI.getOperand(0).getReg();
6775   Register SrcVec = MI.getOperand(1).getReg();
6776   Register InsertVal;
6777   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6778     InsertVal = MI.getOperand(2).getReg();
6779 
6780   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6781 
6782   LLT VecTy = MRI.getType(SrcVec);
6783   LLT EltTy = VecTy.getElementType();
6784   if (!EltTy.isByteSized()) { // Not implemented.
6785     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6786     return UnableToLegalize;
6787   }
6788 
6789   unsigned EltBytes = EltTy.getSizeInBytes();
6790   Align VecAlign = getStackTemporaryAlignment(VecTy);
6791   Align EltAlign;
6792 
6793   MachinePointerInfo PtrInfo;
6794   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6795                                         VecAlign, PtrInfo);
6796   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6797 
6798   // Get the pointer to the element, and be sure not to hit undefined behavior
6799   // if the index is out of bounds.
6800   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6801 
6802   int64_t IdxVal;
6803   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6804     int64_t Offset = IdxVal * EltBytes;
6805     PtrInfo = PtrInfo.getWithOffset(Offset);
6806     EltAlign = commonAlignment(VecAlign, Offset);
6807   } else {
6808     // We lose information with a variable offset.
6809     EltAlign = getStackTemporaryAlignment(EltTy);
6810     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6811   }
6812 
6813   if (InsertVal) {
6814     // Write the inserted element
6815     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6816 
6817     // Reload the whole vector.
6818     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6819   } else {
6820     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6821   }
6822 
6823   MI.eraseFromParent();
6824   return Legalized;
6825 }
6826 
6827 LegalizerHelper::LegalizeResult
6828 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6829   Register DstReg = MI.getOperand(0).getReg();
6830   Register Src0Reg = MI.getOperand(1).getReg();
6831   Register Src1Reg = MI.getOperand(2).getReg();
6832   LLT Src0Ty = MRI.getType(Src0Reg);
6833   LLT DstTy = MRI.getType(DstReg);
6834   LLT IdxTy = LLT::scalar(32);
6835 
6836   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6837 
6838   if (DstTy.isScalar()) {
6839     if (Src0Ty.isVector())
6840       return UnableToLegalize;
6841 
6842     // This is just a SELECT.
6843     assert(Mask.size() == 1 && "Expected a single mask element");
6844     Register Val;
6845     if (Mask[0] < 0 || Mask[0] > 1)
6846       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6847     else
6848       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6849     MIRBuilder.buildCopy(DstReg, Val);
6850     MI.eraseFromParent();
6851     return Legalized;
6852   }
6853 
6854   Register Undef;
6855   SmallVector<Register, 32> BuildVec;
6856   LLT EltTy = DstTy.getElementType();
6857 
6858   for (int Idx : Mask) {
6859     if (Idx < 0) {
6860       if (!Undef.isValid())
6861         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6862       BuildVec.push_back(Undef);
6863       continue;
6864     }
6865 
6866     if (Src0Ty.isScalar()) {
6867       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6868     } else {
6869       int NumElts = Src0Ty.getNumElements();
6870       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6871       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6872       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6873       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6874       BuildVec.push_back(Extract.getReg(0));
6875     }
6876   }
6877 
6878   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6879   MI.eraseFromParent();
6880   return Legalized;
6881 }
6882 
6883 LegalizerHelper::LegalizeResult
6884 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6885   const auto &MF = *MI.getMF();
6886   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6887   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6888     return UnableToLegalize;
6889 
6890   Register Dst = MI.getOperand(0).getReg();
6891   Register AllocSize = MI.getOperand(1).getReg();
6892   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6893 
6894   LLT PtrTy = MRI.getType(Dst);
6895   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6896 
6897   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6898   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6899   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6900 
6901   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6902   // have to generate an extra instruction to negate the alloc and then use
6903   // G_PTR_ADD to add the negative offset.
6904   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6905   if (Alignment > Align(1)) {
6906     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6907     AlignMask.negate();
6908     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6909     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6910   }
6911 
6912   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6913   MIRBuilder.buildCopy(SPReg, SPTmp);
6914   MIRBuilder.buildCopy(Dst, SPTmp);
6915 
6916   MI.eraseFromParent();
6917   return Legalized;
6918 }
6919 
6920 LegalizerHelper::LegalizeResult
6921 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6922   Register Dst = MI.getOperand(0).getReg();
6923   Register Src = MI.getOperand(1).getReg();
6924   unsigned Offset = MI.getOperand(2).getImm();
6925 
6926   LLT DstTy = MRI.getType(Dst);
6927   LLT SrcTy = MRI.getType(Src);
6928 
6929   if (DstTy.isScalar() &&
6930       (SrcTy.isScalar() ||
6931        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6932     LLT SrcIntTy = SrcTy;
6933     if (!SrcTy.isScalar()) {
6934       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6935       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6936     }
6937 
6938     if (Offset == 0)
6939       MIRBuilder.buildTrunc(Dst, Src);
6940     else {
6941       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6942       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6943       MIRBuilder.buildTrunc(Dst, Shr);
6944     }
6945 
6946     MI.eraseFromParent();
6947     return Legalized;
6948   }
6949 
6950   return UnableToLegalize;
6951 }
6952 
6953 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6954   Register Dst = MI.getOperand(0).getReg();
6955   Register Src = MI.getOperand(1).getReg();
6956   Register InsertSrc = MI.getOperand(2).getReg();
6957   uint64_t Offset = MI.getOperand(3).getImm();
6958 
6959   LLT DstTy = MRI.getType(Src);
6960   LLT InsertTy = MRI.getType(InsertSrc);
6961 
6962   if (InsertTy.isVector() ||
6963       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6964     return UnableToLegalize;
6965 
6966   const DataLayout &DL = MIRBuilder.getDataLayout();
6967   if ((DstTy.isPointer() &&
6968        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6969       (InsertTy.isPointer() &&
6970        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6971     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6972     return UnableToLegalize;
6973   }
6974 
6975   LLT IntDstTy = DstTy;
6976 
6977   if (!DstTy.isScalar()) {
6978     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6979     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6980   }
6981 
6982   if (!InsertTy.isScalar()) {
6983     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6984     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6985   }
6986 
6987   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6988   if (Offset != 0) {
6989     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6990     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6991   }
6992 
6993   APInt MaskVal = APInt::getBitsSetWithWrap(
6994       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6995 
6996   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6997   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6998   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6999 
7000   MIRBuilder.buildCast(Dst, Or);
7001   MI.eraseFromParent();
7002   return Legalized;
7003 }
7004 
7005 LegalizerHelper::LegalizeResult
7006 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7007   Register Dst0 = MI.getOperand(0).getReg();
7008   Register Dst1 = MI.getOperand(1).getReg();
7009   Register LHS = MI.getOperand(2).getReg();
7010   Register RHS = MI.getOperand(3).getReg();
7011   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7012 
7013   LLT Ty = MRI.getType(Dst0);
7014   LLT BoolTy = MRI.getType(Dst1);
7015 
7016   if (IsAdd)
7017     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7018   else
7019     MIRBuilder.buildSub(Dst0, LHS, RHS);
7020 
7021   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7022 
7023   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7024 
7025   // For an addition, the result should be less than one of the operands (LHS)
7026   // if and only if the other operand (RHS) is negative, otherwise there will
7027   // be overflow.
7028   // For a subtraction, the result should be less than one of the operands
7029   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7030   // otherwise there will be overflow.
7031   auto ResultLowerThanLHS =
7032       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7033   auto ConditionRHS = MIRBuilder.buildICmp(
7034       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7035 
7036   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7037   MI.eraseFromParent();
7038   return Legalized;
7039 }
7040 
7041 LegalizerHelper::LegalizeResult
7042 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7043   Register Res = MI.getOperand(0).getReg();
7044   Register LHS = MI.getOperand(1).getReg();
7045   Register RHS = MI.getOperand(2).getReg();
7046   LLT Ty = MRI.getType(Res);
7047   bool IsSigned;
7048   bool IsAdd;
7049   unsigned BaseOp;
7050   switch (MI.getOpcode()) {
7051   default:
7052     llvm_unreachable("unexpected addsat/subsat opcode");
7053   case TargetOpcode::G_UADDSAT:
7054     IsSigned = false;
7055     IsAdd = true;
7056     BaseOp = TargetOpcode::G_ADD;
7057     break;
7058   case TargetOpcode::G_SADDSAT:
7059     IsSigned = true;
7060     IsAdd = true;
7061     BaseOp = TargetOpcode::G_ADD;
7062     break;
7063   case TargetOpcode::G_USUBSAT:
7064     IsSigned = false;
7065     IsAdd = false;
7066     BaseOp = TargetOpcode::G_SUB;
7067     break;
7068   case TargetOpcode::G_SSUBSAT:
7069     IsSigned = true;
7070     IsAdd = false;
7071     BaseOp = TargetOpcode::G_SUB;
7072     break;
7073   }
7074 
7075   if (IsSigned) {
7076     // sadd.sat(a, b) ->
7077     //   hi = 0x7fffffff - smax(a, 0)
7078     //   lo = 0x80000000 - smin(a, 0)
7079     //   a + smin(smax(lo, b), hi)
7080     // ssub.sat(a, b) ->
7081     //   lo = smax(a, -1) - 0x7fffffff
7082     //   hi = smin(a, -1) - 0x80000000
7083     //   a - smin(smax(lo, b), hi)
7084     // TODO: AMDGPU can use a "median of 3" instruction here:
7085     //   a +/- med3(lo, b, hi)
7086     uint64_t NumBits = Ty.getScalarSizeInBits();
7087     auto MaxVal =
7088         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7089     auto MinVal =
7090         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7091     MachineInstrBuilder Hi, Lo;
7092     if (IsAdd) {
7093       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7094       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7095       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7096     } else {
7097       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7098       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7099                                MaxVal);
7100       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7101                                MinVal);
7102     }
7103     auto RHSClamped =
7104         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7105     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7106   } else {
7107     // uadd.sat(a, b) -> a + umin(~a, b)
7108     // usub.sat(a, b) -> a - umin(a, b)
7109     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7110     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7111     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7112   }
7113 
7114   MI.eraseFromParent();
7115   return Legalized;
7116 }
7117 
7118 LegalizerHelper::LegalizeResult
7119 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7120   Register Res = MI.getOperand(0).getReg();
7121   Register LHS = MI.getOperand(1).getReg();
7122   Register RHS = MI.getOperand(2).getReg();
7123   LLT Ty = MRI.getType(Res);
7124   LLT BoolTy = Ty.changeElementSize(1);
7125   bool IsSigned;
7126   bool IsAdd;
7127   unsigned OverflowOp;
7128   switch (MI.getOpcode()) {
7129   default:
7130     llvm_unreachable("unexpected addsat/subsat opcode");
7131   case TargetOpcode::G_UADDSAT:
7132     IsSigned = false;
7133     IsAdd = true;
7134     OverflowOp = TargetOpcode::G_UADDO;
7135     break;
7136   case TargetOpcode::G_SADDSAT:
7137     IsSigned = true;
7138     IsAdd = true;
7139     OverflowOp = TargetOpcode::G_SADDO;
7140     break;
7141   case TargetOpcode::G_USUBSAT:
7142     IsSigned = false;
7143     IsAdd = false;
7144     OverflowOp = TargetOpcode::G_USUBO;
7145     break;
7146   case TargetOpcode::G_SSUBSAT:
7147     IsSigned = true;
7148     IsAdd = false;
7149     OverflowOp = TargetOpcode::G_SSUBO;
7150     break;
7151   }
7152 
7153   auto OverflowRes =
7154       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7155   Register Tmp = OverflowRes.getReg(0);
7156   Register Ov = OverflowRes.getReg(1);
7157   MachineInstrBuilder Clamp;
7158   if (IsSigned) {
7159     // sadd.sat(a, b) ->
7160     //   {tmp, ov} = saddo(a, b)
7161     //   ov ? (tmp >>s 31) + 0x80000000 : r
7162     // ssub.sat(a, b) ->
7163     //   {tmp, ov} = ssubo(a, b)
7164     //   ov ? (tmp >>s 31) + 0x80000000 : r
7165     uint64_t NumBits = Ty.getScalarSizeInBits();
7166     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7167     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7168     auto MinVal =
7169         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7170     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7171   } else {
7172     // uadd.sat(a, b) ->
7173     //   {tmp, ov} = uaddo(a, b)
7174     //   ov ? 0xffffffff : tmp
7175     // usub.sat(a, b) ->
7176     //   {tmp, ov} = usubo(a, b)
7177     //   ov ? 0 : tmp
7178     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7179   }
7180   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7181 
7182   MI.eraseFromParent();
7183   return Legalized;
7184 }
7185 
7186 LegalizerHelper::LegalizeResult
7187 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7188   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7189           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7190          "Expected shlsat opcode!");
7191   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7192   Register Res = MI.getOperand(0).getReg();
7193   Register LHS = MI.getOperand(1).getReg();
7194   Register RHS = MI.getOperand(2).getReg();
7195   LLT Ty = MRI.getType(Res);
7196   LLT BoolTy = Ty.changeElementSize(1);
7197 
7198   unsigned BW = Ty.getScalarSizeInBits();
7199   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7200   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7201                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7202 
7203   MachineInstrBuilder SatVal;
7204   if (IsSigned) {
7205     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7206     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7207     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7208                                     MIRBuilder.buildConstant(Ty, 0));
7209     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7210   } else {
7211     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7212   }
7213   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7214   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7215 
7216   MI.eraseFromParent();
7217   return Legalized;
7218 }
7219 
7220 LegalizerHelper::LegalizeResult
7221 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7222   Register Dst = MI.getOperand(0).getReg();
7223   Register Src = MI.getOperand(1).getReg();
7224   const LLT Ty = MRI.getType(Src);
7225   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7226   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7227 
7228   // Swap most and least significant byte, set remaining bytes in Res to zero.
7229   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7230   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7231   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7232   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7233 
7234   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7235   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7236     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7237     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7238     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7239     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7240     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7241     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7242     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7243     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7244     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7245     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7246     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7247     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7248   }
7249   Res.getInstr()->getOperand(0).setReg(Dst);
7250 
7251   MI.eraseFromParent();
7252   return Legalized;
7253 }
7254 
7255 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7256 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7257                                  MachineInstrBuilder Src, APInt Mask) {
7258   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7259   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7260   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7261   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7262   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7263   return B.buildOr(Dst, LHS, RHS);
7264 }
7265 
7266 LegalizerHelper::LegalizeResult
7267 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7268   Register Dst = MI.getOperand(0).getReg();
7269   Register Src = MI.getOperand(1).getReg();
7270   const LLT Ty = MRI.getType(Src);
7271   unsigned Size = Ty.getSizeInBits();
7272 
7273   MachineInstrBuilder BSWAP =
7274       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7275 
7276   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7277   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7278   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7279   MachineInstrBuilder Swap4 =
7280       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7281 
7282   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7283   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7284   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7285   MachineInstrBuilder Swap2 =
7286       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7287 
7288   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7289   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7290   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7291   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7292 
7293   MI.eraseFromParent();
7294   return Legalized;
7295 }
7296 
7297 LegalizerHelper::LegalizeResult
7298 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7299   MachineFunction &MF = MIRBuilder.getMF();
7300 
7301   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7302   int NameOpIdx = IsRead ? 1 : 0;
7303   int ValRegIndex = IsRead ? 0 : 1;
7304 
7305   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7306   const LLT Ty = MRI.getType(ValReg);
7307   const MDString *RegStr = cast<MDString>(
7308     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7309 
7310   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7311   if (!PhysReg.isValid())
7312     return UnableToLegalize;
7313 
7314   if (IsRead)
7315     MIRBuilder.buildCopy(ValReg, PhysReg);
7316   else
7317     MIRBuilder.buildCopy(PhysReg, ValReg);
7318 
7319   MI.eraseFromParent();
7320   return Legalized;
7321 }
7322 
7323 LegalizerHelper::LegalizeResult
7324 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7325   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7326   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7327   Register Result = MI.getOperand(0).getReg();
7328   LLT OrigTy = MRI.getType(Result);
7329   auto SizeInBits = OrigTy.getScalarSizeInBits();
7330   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7331 
7332   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7333   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7334   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7335   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7336 
7337   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7338   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7339   MIRBuilder.buildTrunc(Result, Shifted);
7340 
7341   MI.eraseFromParent();
7342   return Legalized;
7343 }
7344 
7345 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7346   // Implement vector G_SELECT in terms of XOR, AND, OR.
7347   Register DstReg = MI.getOperand(0).getReg();
7348   Register MaskReg = MI.getOperand(1).getReg();
7349   Register Op1Reg = MI.getOperand(2).getReg();
7350   Register Op2Reg = MI.getOperand(3).getReg();
7351   LLT DstTy = MRI.getType(DstReg);
7352   LLT MaskTy = MRI.getType(MaskReg);
7353   LLT Op1Ty = MRI.getType(Op1Reg);
7354   if (!DstTy.isVector())
7355     return UnableToLegalize;
7356 
7357   // Vector selects can have a scalar predicate. If so, splat into a vector and
7358   // finish for later legalization attempts to try again.
7359   if (MaskTy.isScalar()) {
7360     Register MaskElt = MaskReg;
7361     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7362       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7363     // Generate a vector splat idiom to be pattern matched later.
7364     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7365     Observer.changingInstr(MI);
7366     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7367     Observer.changedInstr(MI);
7368     return Legalized;
7369   }
7370 
7371   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7372     return UnableToLegalize;
7373   }
7374 
7375   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7376   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7377   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7378   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7379   MI.eraseFromParent();
7380   return Legalized;
7381 }
7382 
7383 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7384   // Split DIVREM into individual instructions.
7385   unsigned Opcode = MI.getOpcode();
7386 
7387   MIRBuilder.buildInstr(
7388       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7389                                         : TargetOpcode::G_UDIV,
7390       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7391   MIRBuilder.buildInstr(
7392       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7393                                         : TargetOpcode::G_UREM,
7394       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7395   MI.eraseFromParent();
7396   return Legalized;
7397 }
7398 
7399 LegalizerHelper::LegalizeResult
7400 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7401   // Expand %res = G_ABS %a into:
7402   // %v1 = G_ASHR %a, scalar_size-1
7403   // %v2 = G_ADD %a, %v1
7404   // %res = G_XOR %v2, %v1
7405   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7406   Register OpReg = MI.getOperand(1).getReg();
7407   auto ShiftAmt =
7408       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7409   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7410   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7411   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7412   MI.eraseFromParent();
7413   return Legalized;
7414 }
7415 
7416 LegalizerHelper::LegalizeResult
7417 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7418   // Expand %res = G_ABS %a into:
7419   // %v1 = G_CONSTANT 0
7420   // %v2 = G_SUB %v1, %a
7421   // %res = G_SMAX %a, %v2
7422   Register SrcReg = MI.getOperand(1).getReg();
7423   LLT Ty = MRI.getType(SrcReg);
7424   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7425   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7426   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7427   MI.eraseFromParent();
7428   return Legalized;
7429 }
7430 
7431 LegalizerHelper::LegalizeResult
7432 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7433   Register SrcReg = MI.getOperand(1).getReg();
7434   LLT SrcTy = MRI.getType(SrcReg);
7435   LLT DstTy = MRI.getType(SrcReg);
7436 
7437   // The source could be a scalar if the IR type was <1 x sN>.
7438   if (SrcTy.isScalar()) {
7439     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7440       return UnableToLegalize; // FIXME: handle extension.
7441     // This can be just a plain copy.
7442     Observer.changingInstr(MI);
7443     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7444     Observer.changedInstr(MI);
7445     return Legalized;
7446   }
7447   return UnableToLegalize;;
7448 }
7449 
7450 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7451   // On Darwin, -Os means optimize for size without hurting performance, so
7452   // only really optimize for size when -Oz (MinSize) is used.
7453   if (MF.getTarget().getTargetTriple().isOSDarwin())
7454     return MF.getFunction().hasMinSize();
7455   return MF.getFunction().hasOptSize();
7456 }
7457 
7458 // Returns a list of types to use for memory op lowering in MemOps. A partial
7459 // port of findOptimalMemOpLowering in TargetLowering.
7460 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7461                                           unsigned Limit, const MemOp &Op,
7462                                           unsigned DstAS, unsigned SrcAS,
7463                                           const AttributeList &FuncAttributes,
7464                                           const TargetLowering &TLI) {
7465   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7466     return false;
7467 
7468   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7469 
7470   if (Ty == LLT()) {
7471     // Use the largest scalar type whose alignment constraints are satisfied.
7472     // We only need to check DstAlign here as SrcAlign is always greater or
7473     // equal to DstAlign (or zero).
7474     Ty = LLT::scalar(64);
7475     if (Op.isFixedDstAlign())
7476       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7477              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7478         Ty = LLT::scalar(Ty.getSizeInBytes());
7479     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7480     // FIXME: check for the largest legal type we can load/store to.
7481   }
7482 
7483   unsigned NumMemOps = 0;
7484   uint64_t Size = Op.size();
7485   while (Size) {
7486     unsigned TySize = Ty.getSizeInBytes();
7487     while (TySize > Size) {
7488       // For now, only use non-vector load / store's for the left-over pieces.
7489       LLT NewTy = Ty;
7490       // FIXME: check for mem op safety and legality of the types. Not all of
7491       // SDAGisms map cleanly to GISel concepts.
7492       if (NewTy.isVector())
7493         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7494       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7495       unsigned NewTySize = NewTy.getSizeInBytes();
7496       assert(NewTySize > 0 && "Could not find appropriate type");
7497 
7498       // If the new LLT cannot cover all of the remaining bits, then consider
7499       // issuing a (or a pair of) unaligned and overlapping load / store.
7500       bool Fast;
7501       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7502       MVT VT = getMVTForLLT(Ty);
7503       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7504           TLI.allowsMisalignedMemoryAccesses(
7505               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7506               MachineMemOperand::MONone, &Fast) &&
7507           Fast)
7508         TySize = Size;
7509       else {
7510         Ty = NewTy;
7511         TySize = NewTySize;
7512       }
7513     }
7514 
7515     if (++NumMemOps > Limit)
7516       return false;
7517 
7518     MemOps.push_back(Ty);
7519     Size -= TySize;
7520   }
7521 
7522   return true;
7523 }
7524 
7525 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7526   if (Ty.isVector())
7527     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7528                                 Ty.getNumElements());
7529   return IntegerType::get(C, Ty.getSizeInBits());
7530 }
7531 
7532 // Get a vectorized representation of the memset value operand, GISel edition.
7533 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7534   MachineRegisterInfo &MRI = *MIB.getMRI();
7535   unsigned NumBits = Ty.getScalarSizeInBits();
7536   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7537   if (!Ty.isVector() && ValVRegAndVal) {
7538     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7539     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7540     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7541   }
7542 
7543   // Extend the byte value to the larger type, and then multiply by a magic
7544   // value 0x010101... in order to replicate it across every byte.
7545   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7546   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7547     return MIB.buildConstant(Ty, 0).getReg(0);
7548   }
7549 
7550   LLT ExtType = Ty.getScalarType();
7551   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7552   if (NumBits > 8) {
7553     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7554     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7555     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7556   }
7557 
7558   // For vector types create a G_BUILD_VECTOR.
7559   if (Ty.isVector())
7560     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7561 
7562   return Val;
7563 }
7564 
7565 LegalizerHelper::LegalizeResult
7566 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7567                              uint64_t KnownLen, Align Alignment,
7568                              bool IsVolatile) {
7569   auto &MF = *MI.getParent()->getParent();
7570   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7571   auto &DL = MF.getDataLayout();
7572   LLVMContext &C = MF.getFunction().getContext();
7573 
7574   assert(KnownLen != 0 && "Have a zero length memset length!");
7575 
7576   bool DstAlignCanChange = false;
7577   MachineFrameInfo &MFI = MF.getFrameInfo();
7578   bool OptSize = shouldLowerMemFuncForSize(MF);
7579 
7580   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7581   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7582     DstAlignCanChange = true;
7583 
7584   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7585   std::vector<LLT> MemOps;
7586 
7587   const auto &DstMMO = **MI.memoperands_begin();
7588   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7589 
7590   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7591   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7592 
7593   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7594                                      MemOp::Set(KnownLen, DstAlignCanChange,
7595                                                 Alignment,
7596                                                 /*IsZeroMemset=*/IsZeroVal,
7597                                                 /*IsVolatile=*/IsVolatile),
7598                                      DstPtrInfo.getAddrSpace(), ~0u,
7599                                      MF.getFunction().getAttributes(), TLI))
7600     return UnableToLegalize;
7601 
7602   if (DstAlignCanChange) {
7603     // Get an estimate of the type from the LLT.
7604     Type *IRTy = getTypeForLLT(MemOps[0], C);
7605     Align NewAlign = DL.getABITypeAlign(IRTy);
7606     if (NewAlign > Alignment) {
7607       Alignment = NewAlign;
7608       unsigned FI = FIDef->getOperand(1).getIndex();
7609       // Give the stack frame object a larger alignment if needed.
7610       if (MFI.getObjectAlign(FI) < Alignment)
7611         MFI.setObjectAlignment(FI, Alignment);
7612     }
7613   }
7614 
7615   MachineIRBuilder MIB(MI);
7616   // Find the largest store and generate the bit pattern for it.
7617   LLT LargestTy = MemOps[0];
7618   for (unsigned i = 1; i < MemOps.size(); i++)
7619     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7620       LargestTy = MemOps[i];
7621 
7622   // The memset stored value is always defined as an s8, so in order to make it
7623   // work with larger store types we need to repeat the bit pattern across the
7624   // wider type.
7625   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7626 
7627   if (!MemSetValue)
7628     return UnableToLegalize;
7629 
7630   // Generate the stores. For each store type in the list, we generate the
7631   // matching store of that type to the destination address.
7632   LLT PtrTy = MRI.getType(Dst);
7633   unsigned DstOff = 0;
7634   unsigned Size = KnownLen;
7635   for (unsigned I = 0; I < MemOps.size(); I++) {
7636     LLT Ty = MemOps[I];
7637     unsigned TySize = Ty.getSizeInBytes();
7638     if (TySize > Size) {
7639       // Issuing an unaligned load / store pair that overlaps with the previous
7640       // pair. Adjust the offset accordingly.
7641       assert(I == MemOps.size() - 1 && I != 0);
7642       DstOff -= TySize - Size;
7643     }
7644 
7645     // If this store is smaller than the largest store see whether we can get
7646     // the smaller value for free with a truncate.
7647     Register Value = MemSetValue;
7648     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7649       MVT VT = getMVTForLLT(Ty);
7650       MVT LargestVT = getMVTForLLT(LargestTy);
7651       if (!LargestTy.isVector() && !Ty.isVector() &&
7652           TLI.isTruncateFree(LargestVT, VT))
7653         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7654       else
7655         Value = getMemsetValue(Val, Ty, MIB);
7656       if (!Value)
7657         return UnableToLegalize;
7658     }
7659 
7660     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7661 
7662     Register Ptr = Dst;
7663     if (DstOff != 0) {
7664       auto Offset =
7665           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7666       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7667     }
7668 
7669     MIB.buildStore(Value, Ptr, *StoreMMO);
7670     DstOff += Ty.getSizeInBytes();
7671     Size -= TySize;
7672   }
7673 
7674   MI.eraseFromParent();
7675   return Legalized;
7676 }
7677 
7678 LegalizerHelper::LegalizeResult
7679 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7680   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7681 
7682   Register Dst = MI.getOperand(0).getReg();
7683   Register Src = MI.getOperand(1).getReg();
7684   Register Len = MI.getOperand(2).getReg();
7685 
7686   const auto *MMOIt = MI.memoperands_begin();
7687   const MachineMemOperand *MemOp = *MMOIt;
7688   bool IsVolatile = MemOp->isVolatile();
7689 
7690   // See if this is a constant length copy
7691   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7692   // FIXME: support dynamically sized G_MEMCPY_INLINE
7693   assert(LenVRegAndVal.hasValue() &&
7694          "inline memcpy with dynamic size is not yet supported");
7695   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7696   if (KnownLen == 0) {
7697     MI.eraseFromParent();
7698     return Legalized;
7699   }
7700 
7701   const auto &DstMMO = **MI.memoperands_begin();
7702   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7703   Align DstAlign = DstMMO.getBaseAlign();
7704   Align SrcAlign = SrcMMO.getBaseAlign();
7705 
7706   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7707                            IsVolatile);
7708 }
7709 
7710 LegalizerHelper::LegalizeResult
7711 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7712                                    uint64_t KnownLen, Align DstAlign,
7713                                    Align SrcAlign, bool IsVolatile) {
7714   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7715   return lowerMemcpy(MI, Dst, Src, KnownLen,
7716                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7717                      IsVolatile);
7718 }
7719 
7720 LegalizerHelper::LegalizeResult
7721 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7722                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7723                              Align SrcAlign, bool IsVolatile) {
7724   auto &MF = *MI.getParent()->getParent();
7725   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7726   auto &DL = MF.getDataLayout();
7727   LLVMContext &C = MF.getFunction().getContext();
7728 
7729   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7730 
7731   bool DstAlignCanChange = false;
7732   MachineFrameInfo &MFI = MF.getFrameInfo();
7733   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7734 
7735   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7736   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7737     DstAlignCanChange = true;
7738 
7739   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7740   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7741   // if the memcpy is in a tail call position.
7742 
7743   std::vector<LLT> MemOps;
7744 
7745   const auto &DstMMO = **MI.memoperands_begin();
7746   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7747   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7748   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7749 
7750   if (!findGISelOptimalMemOpLowering(
7751           MemOps, Limit,
7752           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7753                       IsVolatile),
7754           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7755           MF.getFunction().getAttributes(), TLI))
7756     return UnableToLegalize;
7757 
7758   if (DstAlignCanChange) {
7759     // Get an estimate of the type from the LLT.
7760     Type *IRTy = getTypeForLLT(MemOps[0], C);
7761     Align NewAlign = DL.getABITypeAlign(IRTy);
7762 
7763     // Don't promote to an alignment that would require dynamic stack
7764     // realignment.
7765     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7766     if (!TRI->hasStackRealignment(MF))
7767       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7768         NewAlign = NewAlign / 2;
7769 
7770     if (NewAlign > Alignment) {
7771       Alignment = NewAlign;
7772       unsigned FI = FIDef->getOperand(1).getIndex();
7773       // Give the stack frame object a larger alignment if needed.
7774       if (MFI.getObjectAlign(FI) < Alignment)
7775         MFI.setObjectAlignment(FI, Alignment);
7776     }
7777   }
7778 
7779   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7780 
7781   MachineIRBuilder MIB(MI);
7782   // Now we need to emit a pair of load and stores for each of the types we've
7783   // collected. I.e. for each type, generate a load from the source pointer of
7784   // that type width, and then generate a corresponding store to the dest buffer
7785   // of that value loaded. This can result in a sequence of loads and stores
7786   // mixed types, depending on what the target specifies as good types to use.
7787   unsigned CurrOffset = 0;
7788   LLT PtrTy = MRI.getType(Src);
7789   unsigned Size = KnownLen;
7790   for (auto CopyTy : MemOps) {
7791     // Issuing an unaligned load / store pair  that overlaps with the previous
7792     // pair. Adjust the offset accordingly.
7793     if (CopyTy.getSizeInBytes() > Size)
7794       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7795 
7796     // Construct MMOs for the accesses.
7797     auto *LoadMMO =
7798         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7799     auto *StoreMMO =
7800         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7801 
7802     // Create the load.
7803     Register LoadPtr = Src;
7804     Register Offset;
7805     if (CurrOffset != 0) {
7806       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
7807                    .getReg(0);
7808       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7809     }
7810     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7811 
7812     // Create the store.
7813     Register StorePtr =
7814         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7815     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7816     CurrOffset += CopyTy.getSizeInBytes();
7817     Size -= CopyTy.getSizeInBytes();
7818   }
7819 
7820   MI.eraseFromParent();
7821   return Legalized;
7822 }
7823 
7824 LegalizerHelper::LegalizeResult
7825 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7826                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7827                               bool IsVolatile) {
7828   auto &MF = *MI.getParent()->getParent();
7829   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7830   auto &DL = MF.getDataLayout();
7831   LLVMContext &C = MF.getFunction().getContext();
7832 
7833   assert(KnownLen != 0 && "Have a zero length memmove length!");
7834 
7835   bool DstAlignCanChange = false;
7836   MachineFrameInfo &MFI = MF.getFrameInfo();
7837   bool OptSize = shouldLowerMemFuncForSize(MF);
7838   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7839 
7840   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7841   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7842     DstAlignCanChange = true;
7843 
7844   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7845   std::vector<LLT> MemOps;
7846 
7847   const auto &DstMMO = **MI.memoperands_begin();
7848   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7849   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7850   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7851 
7852   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7853   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
7854   // same thing here.
7855   if (!findGISelOptimalMemOpLowering(
7856           MemOps, Limit,
7857           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7858                       /*IsVolatile*/ true),
7859           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7860           MF.getFunction().getAttributes(), TLI))
7861     return UnableToLegalize;
7862 
7863   if (DstAlignCanChange) {
7864     // Get an estimate of the type from the LLT.
7865     Type *IRTy = getTypeForLLT(MemOps[0], C);
7866     Align NewAlign = DL.getABITypeAlign(IRTy);
7867 
7868     // Don't promote to an alignment that would require dynamic stack
7869     // realignment.
7870     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7871     if (!TRI->hasStackRealignment(MF))
7872       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7873         NewAlign = NewAlign / 2;
7874 
7875     if (NewAlign > Alignment) {
7876       Alignment = NewAlign;
7877       unsigned FI = FIDef->getOperand(1).getIndex();
7878       // Give the stack frame object a larger alignment if needed.
7879       if (MFI.getObjectAlign(FI) < Alignment)
7880         MFI.setObjectAlignment(FI, Alignment);
7881     }
7882   }
7883 
7884   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7885 
7886   MachineIRBuilder MIB(MI);
7887   // Memmove requires that we perform the loads first before issuing the stores.
7888   // Apart from that, this loop is pretty much doing the same thing as the
7889   // memcpy codegen function.
7890   unsigned CurrOffset = 0;
7891   LLT PtrTy = MRI.getType(Src);
7892   SmallVector<Register, 16> LoadVals;
7893   for (auto CopyTy : MemOps) {
7894     // Construct MMO for the load.
7895     auto *LoadMMO =
7896         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7897 
7898     // Create the load.
7899     Register LoadPtr = Src;
7900     if (CurrOffset != 0) {
7901       auto Offset =
7902           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7903       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
7904     }
7905     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7906     CurrOffset += CopyTy.getSizeInBytes();
7907   }
7908 
7909   CurrOffset = 0;
7910   for (unsigned I = 0; I < MemOps.size(); ++I) {
7911     LLT CopyTy = MemOps[I];
7912     // Now store the values loaded.
7913     auto *StoreMMO =
7914         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7915 
7916     Register StorePtr = Dst;
7917     if (CurrOffset != 0) {
7918       auto Offset =
7919           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
7920       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7921     }
7922     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7923     CurrOffset += CopyTy.getSizeInBytes();
7924   }
7925   MI.eraseFromParent();
7926   return Legalized;
7927 }
7928 
7929 LegalizerHelper::LegalizeResult
7930 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7931   const unsigned Opc = MI.getOpcode();
7932   // This combine is fairly complex so it's not written with a separate
7933   // matcher function.
7934   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7935           Opc == TargetOpcode::G_MEMSET) &&
7936          "Expected memcpy like instruction");
7937 
7938   auto MMOIt = MI.memoperands_begin();
7939   const MachineMemOperand *MemOp = *MMOIt;
7940 
7941   Align DstAlign = MemOp->getBaseAlign();
7942   Align SrcAlign;
7943   Register Dst = MI.getOperand(0).getReg();
7944   Register Src = MI.getOperand(1).getReg();
7945   Register Len = MI.getOperand(2).getReg();
7946 
7947   if (Opc != TargetOpcode::G_MEMSET) {
7948     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7949     MemOp = *(++MMOIt);
7950     SrcAlign = MemOp->getBaseAlign();
7951   }
7952 
7953   // See if this is a constant length copy
7954   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7955   if (!LenVRegAndVal)
7956     return UnableToLegalize;
7957   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7958 
7959   if (KnownLen == 0) {
7960     MI.eraseFromParent();
7961     return Legalized;
7962   }
7963 
7964   bool IsVolatile = MemOp->isVolatile();
7965   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7966     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7967                              IsVolatile);
7968 
7969   // Don't try to optimize volatile.
7970   if (IsVolatile)
7971     return UnableToLegalize;
7972 
7973   if (MaxLen && KnownLen > MaxLen)
7974     return UnableToLegalize;
7975 
7976   if (Opc == TargetOpcode::G_MEMCPY) {
7977     auto &MF = *MI.getParent()->getParent();
7978     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7979     bool OptSize = shouldLowerMemFuncForSize(MF);
7980     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7981     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7982                        IsVolatile);
7983   }
7984   if (Opc == TargetOpcode::G_MEMMOVE)
7985     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7986   if (Opc == TargetOpcode::G_MEMSET)
7987     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7988   return UnableToLegalize;
7989 }
7990