1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetFrameLowering.h"
24 #include "llvm/CodeGen/TargetInstrInfo.h"
25 #include "llvm/CodeGen/TargetLowering.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include "llvm/Target/TargetMachine.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
42 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
43 /// with any leftover piece as type \p LeftoverTy
44 ///
45 /// Returns -1 in the first element of the pair if the breakdown is not
46 /// satisfiable.
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
74 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
75 
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
95 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
96                                  GISelChangeObserver &Observer,
97                                  MachineIRBuilder &Builder)
98     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
99       LI(*MF.getSubtarget().getLegalizerInfo()),
100       TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &B)
105   : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
106     TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
108 LegalizerHelper::LegalizeResult
109 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
110                                    LostDebugLocObserver &LocObserver) {
111   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
112 
113   MIRBuilder.setInstrAndDebugLoc(MI);
114 
115   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
116       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
117     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
118   auto Step = LI.getAction(MI, MRI);
119   switch (Step.Action) {
120   case Legal:
121     LLVM_DEBUG(dbgs() << ".. Already legal\n");
122     return AlreadyLegal;
123   case Libcall:
124     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
125     return libcall(MI, LocObserver);
126   case NarrowScalar:
127     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
128     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
129   case WidenScalar:
130     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
131     return widenScalar(MI, Step.TypeIdx, Step.NewType);
132   case Bitcast:
133     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
134     return bitcast(MI, Step.TypeIdx, Step.NewType);
135   case Lower:
136     LLVM_DEBUG(dbgs() << ".. Lower\n");
137     return lower(MI, Step.TypeIdx, Step.NewType);
138   case FewerElements:
139     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
140     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
141   case MoreElements:
142     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
143     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
144   case Custom:
145     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
146     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
147   default:
148     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
149     return UnableToLegalize;
150   }
151 }
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
160 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
161                                    LLT MainTy, LLT &LeftoverTy,
162                                    SmallVectorImpl<Register> &VRegs,
163                                    SmallVectorImpl<Register> &LeftoverRegs) {
164   assert(!LeftoverTy.isValid() && "this is an out argument");
165 
166   unsigned RegSize = RegTy.getSizeInBits();
167   unsigned MainSize = MainTy.getSizeInBits();
168   unsigned NumParts = RegSize / MainSize;
169   unsigned LeftoverSize = RegSize - NumParts * MainSize;
170 
171   // Use an unmerge when possible.
172   if (LeftoverSize == 0) {
173     for (unsigned I = 0; I < NumParts; ++I)
174       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
175     MIRBuilder.buildUnmerge(VRegs, Reg);
176     return true;
177   }
178 
179   // Perform irregular split. Leftover is last element of RegPieces.
180   if (MainTy.isVector()) {
181     SmallVector<Register, 8> RegPieces;
182     extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
183     for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
184       VRegs.push_back(RegPieces[i]);
185     LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
186     LeftoverTy = MRI.getType(LeftoverRegs[0]);
187     return true;
188   }
189 
190   LeftoverTy = LLT::scalar(LeftoverSize);
191   // For irregular sizes, extract the individual parts.
192   for (unsigned I = 0; I != NumParts; ++I) {
193     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
194     VRegs.push_back(NewReg);
195     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
196   }
197 
198   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
199        Offset += LeftoverSize) {
200     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
201     LeftoverRegs.push_back(NewReg);
202     MIRBuilder.buildExtract(NewReg, Reg, Offset);
203   }
204 
205   return true;
206 }
207 
208 void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
209                                          SmallVectorImpl<Register> &VRegs) {
210   LLT RegTy = MRI.getType(Reg);
211   assert(RegTy.isVector() && "Expected a vector type");
212 
213   LLT EltTy = RegTy.getElementType();
214   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
215   unsigned RegNumElts = RegTy.getNumElements();
216   unsigned LeftoverNumElts = RegNumElts % NumElts;
217   unsigned NumNarrowTyPieces = RegNumElts / NumElts;
218 
219   // Perfect split without leftover
220   if (LeftoverNumElts == 0)
221     return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);
222 
223   // Irregular split. Provide direct access to all elements for artifact
224   // combiner using unmerge to elements. Then build vectors with NumElts
225   // elements. Remaining element(s) will be (used to build vector) Leftover.
226   SmallVector<Register, 8> Elts;
227   extractParts(Reg, EltTy, RegNumElts, Elts);
228 
229   unsigned Offset = 0;
230   // Requested sub-vectors of NarrowTy.
231   for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
232     ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
233     VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
234   }
235 
236   // Leftover element(s).
237   if (LeftoverNumElts == 1) {
238     VRegs.push_back(Elts[Offset]);
239   } else {
240     LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
241     ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
242     VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
243   }
244 }
245 
246 void LegalizerHelper::insertParts(Register DstReg,
247                                   LLT ResultTy, LLT PartTy,
248                                   ArrayRef<Register> PartRegs,
249                                   LLT LeftoverTy,
250                                   ArrayRef<Register> LeftoverRegs) {
251   if (!LeftoverTy.isValid()) {
252     assert(LeftoverRegs.empty());
253 
254     if (!ResultTy.isVector()) {
255       MIRBuilder.buildMerge(DstReg, PartRegs);
256       return;
257     }
258 
259     if (PartTy.isVector())
260       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
261     else
262       MIRBuilder.buildBuildVector(DstReg, PartRegs);
263     return;
264   }
265 
266   // Merge sub-vectors with different number of elements and insert into DstReg.
267   if (ResultTy.isVector()) {
268     assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
269     SmallVector<Register, 8> AllRegs;
270     for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
271       AllRegs.push_back(Reg);
272     return mergeMixedSubvectors(DstReg, AllRegs);
273   }
274 
275   SmallVector<Register> GCDRegs;
276   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
277   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
278     extractGCDType(GCDRegs, GCDTy, PartReg);
279   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
280   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
281 }
282 
283 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
284                                        Register Reg) {
285   LLT Ty = MRI.getType(Reg);
286   SmallVector<Register, 8> RegElts;
287   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
288   Elts.append(RegElts);
289 }
290 
291 /// Merge \p PartRegs with different types into \p DstReg.
292 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
293                                            ArrayRef<Register> PartRegs) {
294   SmallVector<Register, 8> AllElts;
295   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
296     appendVectorElts(AllElts, PartRegs[i]);
297 
298   Register Leftover = PartRegs[PartRegs.size() - 1];
299   if (MRI.getType(Leftover).isScalar())
300     AllElts.push_back(Leftover);
301   else
302     appendVectorElts(AllElts, Leftover);
303 
304   MIRBuilder.buildMerge(DstReg, AllElts);
305 }
306 
307 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
308 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
309                               const MachineInstr &MI) {
310   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
311 
312   const int StartIdx = Regs.size();
313   const int NumResults = MI.getNumOperands() - 1;
314   Regs.resize(Regs.size() + NumResults);
315   for (int I = 0; I != NumResults; ++I)
316     Regs[StartIdx + I] = MI.getOperand(I).getReg();
317 }
318 
319 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
320                                      LLT GCDTy, Register SrcReg) {
321   LLT SrcTy = MRI.getType(SrcReg);
322   if (SrcTy == GCDTy) {
323     // If the source already evenly divides the result type, we don't need to do
324     // anything.
325     Parts.push_back(SrcReg);
326   } else {
327     // Need to split into common type sized pieces.
328     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
329     getUnmergeResults(Parts, *Unmerge);
330   }
331 }
332 
333 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
334                                     LLT NarrowTy, Register SrcReg) {
335   LLT SrcTy = MRI.getType(SrcReg);
336   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
337   extractGCDType(Parts, GCDTy, SrcReg);
338   return GCDTy;
339 }
340 
341 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
342                                          SmallVectorImpl<Register> &VRegs,
343                                          unsigned PadStrategy) {
344   LLT LCMTy = getLCMType(DstTy, NarrowTy);
345 
346   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
347   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
348   int NumOrigSrc = VRegs.size();
349 
350   Register PadReg;
351 
352   // Get a value we can use to pad the source value if the sources won't evenly
353   // cover the result type.
354   if (NumOrigSrc < NumParts * NumSubParts) {
355     if (PadStrategy == TargetOpcode::G_ZEXT)
356       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
357     else if (PadStrategy == TargetOpcode::G_ANYEXT)
358       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
359     else {
360       assert(PadStrategy == TargetOpcode::G_SEXT);
361 
362       // Shift the sign bit of the low register through the high register.
363       auto ShiftAmt =
364         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
365       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
366     }
367   }
368 
369   // Registers for the final merge to be produced.
370   SmallVector<Register, 4> Remerge(NumParts);
371 
372   // Registers needed for intermediate merges, which will be merged into a
373   // source for Remerge.
374   SmallVector<Register, 4> SubMerge(NumSubParts);
375 
376   // Once we've fully read off the end of the original source bits, we can reuse
377   // the same high bits for remaining padding elements.
378   Register AllPadReg;
379 
380   // Build merges to the LCM type to cover the original result type.
381   for (int I = 0; I != NumParts; ++I) {
382     bool AllMergePartsArePadding = true;
383 
384     // Build the requested merges to the requested type.
385     for (int J = 0; J != NumSubParts; ++J) {
386       int Idx = I * NumSubParts + J;
387       if (Idx >= NumOrigSrc) {
388         SubMerge[J] = PadReg;
389         continue;
390       }
391 
392       SubMerge[J] = VRegs[Idx];
393 
394       // There are meaningful bits here we can't reuse later.
395       AllMergePartsArePadding = false;
396     }
397 
398     // If we've filled up a complete piece with padding bits, we can directly
399     // emit the natural sized constant if applicable, rather than a merge of
400     // smaller constants.
401     if (AllMergePartsArePadding && !AllPadReg) {
402       if (PadStrategy == TargetOpcode::G_ANYEXT)
403         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
404       else if (PadStrategy == TargetOpcode::G_ZEXT)
405         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
406 
407       // If this is a sign extension, we can't materialize a trivial constant
408       // with the right type and have to produce a merge.
409     }
410 
411     if (AllPadReg) {
412       // Avoid creating additional instructions if we're just adding additional
413       // copies of padding bits.
414       Remerge[I] = AllPadReg;
415       continue;
416     }
417 
418     if (NumSubParts == 1)
419       Remerge[I] = SubMerge[0];
420     else
421       Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
422 
423     // In the sign extend padding case, re-use the first all-signbit merge.
424     if (AllMergePartsArePadding && !AllPadReg)
425       AllPadReg = Remerge[I];
426   }
427 
428   VRegs = std::move(Remerge);
429   return LCMTy;
430 }
431 
432 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
433                                                ArrayRef<Register> RemergeRegs) {
434   LLT DstTy = MRI.getType(DstReg);
435 
436   // Create the merge to the widened source, and extract the relevant bits into
437   // the result.
438 
439   if (DstTy == LCMTy) {
440     MIRBuilder.buildMerge(DstReg, RemergeRegs);
441     return;
442   }
443 
444   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
445   if (DstTy.isScalar() && LCMTy.isScalar()) {
446     MIRBuilder.buildTrunc(DstReg, Remerge);
447     return;
448   }
449 
450   if (LCMTy.isVector()) {
451     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
452     SmallVector<Register, 8> UnmergeDefs(NumDefs);
453     UnmergeDefs[0] = DstReg;
454     for (unsigned I = 1; I != NumDefs; ++I)
455       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
456 
457     MIRBuilder.buildUnmerge(UnmergeDefs,
458                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
459     return;
460   }
461 
462   llvm_unreachable("unhandled case");
463 }
464 
465 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
466 #define RTLIBCASE_INT(LibcallPrefix)                                           \
467   do {                                                                         \
468     switch (Size) {                                                            \
469     case 32:                                                                   \
470       return RTLIB::LibcallPrefix##32;                                         \
471     case 64:                                                                   \
472       return RTLIB::LibcallPrefix##64;                                         \
473     case 128:                                                                  \
474       return RTLIB::LibcallPrefix##128;                                        \
475     default:                                                                   \
476       llvm_unreachable("unexpected size");                                     \
477     }                                                                          \
478   } while (0)
479 
480 #define RTLIBCASE(LibcallPrefix)                                               \
481   do {                                                                         \
482     switch (Size) {                                                            \
483     case 32:                                                                   \
484       return RTLIB::LibcallPrefix##32;                                         \
485     case 64:                                                                   \
486       return RTLIB::LibcallPrefix##64;                                         \
487     case 80:                                                                   \
488       return RTLIB::LibcallPrefix##80;                                         \
489     case 128:                                                                  \
490       return RTLIB::LibcallPrefix##128;                                        \
491     default:                                                                   \
492       llvm_unreachable("unexpected size");                                     \
493     }                                                                          \
494   } while (0)
495 
496   switch (Opcode) {
497   case TargetOpcode::G_SDIV:
498     RTLIBCASE_INT(SDIV_I);
499   case TargetOpcode::G_UDIV:
500     RTLIBCASE_INT(UDIV_I);
501   case TargetOpcode::G_SREM:
502     RTLIBCASE_INT(SREM_I);
503   case TargetOpcode::G_UREM:
504     RTLIBCASE_INT(UREM_I);
505   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
506     RTLIBCASE_INT(CTLZ_I);
507   case TargetOpcode::G_FADD:
508     RTLIBCASE(ADD_F);
509   case TargetOpcode::G_FSUB:
510     RTLIBCASE(SUB_F);
511   case TargetOpcode::G_FMUL:
512     RTLIBCASE(MUL_F);
513   case TargetOpcode::G_FDIV:
514     RTLIBCASE(DIV_F);
515   case TargetOpcode::G_FEXP:
516     RTLIBCASE(EXP_F);
517   case TargetOpcode::G_FEXP2:
518     RTLIBCASE(EXP2_F);
519   case TargetOpcode::G_FREM:
520     RTLIBCASE(REM_F);
521   case TargetOpcode::G_FPOW:
522     RTLIBCASE(POW_F);
523   case TargetOpcode::G_FMA:
524     RTLIBCASE(FMA_F);
525   case TargetOpcode::G_FSIN:
526     RTLIBCASE(SIN_F);
527   case TargetOpcode::G_FCOS:
528     RTLIBCASE(COS_F);
529   case TargetOpcode::G_FLOG10:
530     RTLIBCASE(LOG10_F);
531   case TargetOpcode::G_FLOG:
532     RTLIBCASE(LOG_F);
533   case TargetOpcode::G_FLOG2:
534     RTLIBCASE(LOG2_F);
535   case TargetOpcode::G_FCEIL:
536     RTLIBCASE(CEIL_F);
537   case TargetOpcode::G_FFLOOR:
538     RTLIBCASE(FLOOR_F);
539   case TargetOpcode::G_FMINNUM:
540     RTLIBCASE(FMIN_F);
541   case TargetOpcode::G_FMAXNUM:
542     RTLIBCASE(FMAX_F);
543   case TargetOpcode::G_FSQRT:
544     RTLIBCASE(SQRT_F);
545   case TargetOpcode::G_FRINT:
546     RTLIBCASE(RINT_F);
547   case TargetOpcode::G_FNEARBYINT:
548     RTLIBCASE(NEARBYINT_F);
549   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
550     RTLIBCASE(ROUNDEVEN_F);
551   }
552   llvm_unreachable("Unknown libcall function");
553 }
554 
555 /// True if an instruction is in tail position in its caller. Intended for
556 /// legalizing libcalls as tail calls when possible.
557 static bool isLibCallInTailPosition(MachineInstr &MI,
558                                     const TargetInstrInfo &TII,
559                                     MachineRegisterInfo &MRI) {
560   MachineBasicBlock &MBB = *MI.getParent();
561   const Function &F = MBB.getParent()->getFunction();
562 
563   // Conservatively require the attributes of the call to match those of
564   // the return. Ignore NoAlias and NonNull because they don't affect the
565   // call sequence.
566   AttributeList CallerAttrs = F.getAttributes();
567   if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
568           .removeAttribute(Attribute::NoAlias)
569           .removeAttribute(Attribute::NonNull)
570           .hasAttributes())
571     return false;
572 
573   // It's not safe to eliminate the sign / zero extension of the return value.
574   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
575       CallerAttrs.hasRetAttr(Attribute::SExt))
576     return false;
577 
578   // Only tail call if the following instruction is a standard return or if we
579   // have a `thisreturn` callee, and a sequence like:
580   //
581   //   G_MEMCPY %0, %1, %2
582   //   $x0 = COPY %0
583   //   RET_ReallyLR implicit $x0
584   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
585   if (Next != MBB.instr_end() && Next->isCopy()) {
586     switch (MI.getOpcode()) {
587     default:
588       llvm_unreachable("unsupported opcode");
589     case TargetOpcode::G_BZERO:
590       return false;
591     case TargetOpcode::G_MEMCPY:
592     case TargetOpcode::G_MEMMOVE:
593     case TargetOpcode::G_MEMSET:
594       break;
595     }
596 
597     Register VReg = MI.getOperand(0).getReg();
598     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
599       return false;
600 
601     Register PReg = Next->getOperand(0).getReg();
602     if (!PReg.isPhysical())
603       return false;
604 
605     auto Ret = next_nodbg(Next, MBB.instr_end());
606     if (Ret == MBB.instr_end() || !Ret->isReturn())
607       return false;
608 
609     if (Ret->getNumImplicitOperands() != 1)
610       return false;
611 
612     if (PReg != Ret->getOperand(0).getReg())
613       return false;
614 
615     // Skip over the COPY that we just validated.
616     Next = Ret;
617   }
618 
619   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
620     return false;
621 
622   return true;
623 }
624 
625 LegalizerHelper::LegalizeResult
626 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
627                     const CallLowering::ArgInfo &Result,
628                     ArrayRef<CallLowering::ArgInfo> Args,
629                     const CallingConv::ID CC) {
630   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
631 
632   CallLowering::CallLoweringInfo Info;
633   Info.CallConv = CC;
634   Info.Callee = MachineOperand::CreateES(Name);
635   Info.OrigRet = Result;
636   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
637   if (!CLI.lowerCall(MIRBuilder, Info))
638     return LegalizerHelper::UnableToLegalize;
639 
640   return LegalizerHelper::Legalized;
641 }
642 
643 LegalizerHelper::LegalizeResult
644 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
645                     const CallLowering::ArgInfo &Result,
646                     ArrayRef<CallLowering::ArgInfo> Args) {
647   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
648   const char *Name = TLI.getLibcallName(Libcall);
649   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
650   return createLibcall(MIRBuilder, Name, Result, Args, CC);
651 }
652 
653 // Useful for libcalls where all operands have the same type.
654 static LegalizerHelper::LegalizeResult
655 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
656               Type *OpType) {
657   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
658 
659   // FIXME: What does the original arg index mean here?
660   SmallVector<CallLowering::ArgInfo, 3> Args;
661   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
662     Args.push_back({MO.getReg(), OpType, 0});
663   return createLibcall(MIRBuilder, Libcall,
664                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
665 }
666 
667 LegalizerHelper::LegalizeResult
668 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
669                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
670   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
671 
672   SmallVector<CallLowering::ArgInfo, 3> Args;
673   // Add all the args, except for the last which is an imm denoting 'tail'.
674   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
675     Register Reg = MI.getOperand(i).getReg();
676 
677     // Need derive an IR type for call lowering.
678     LLT OpLLT = MRI.getType(Reg);
679     Type *OpTy = nullptr;
680     if (OpLLT.isPointer())
681       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
682     else
683       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
684     Args.push_back({Reg, OpTy, 0});
685   }
686 
687   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
688   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
689   RTLIB::Libcall RTLibcall;
690   unsigned Opc = MI.getOpcode();
691   switch (Opc) {
692   case TargetOpcode::G_BZERO:
693     RTLibcall = RTLIB::BZERO;
694     break;
695   case TargetOpcode::G_MEMCPY:
696     RTLibcall = RTLIB::MEMCPY;
697     Args[0].Flags[0].setReturned();
698     break;
699   case TargetOpcode::G_MEMMOVE:
700     RTLibcall = RTLIB::MEMMOVE;
701     Args[0].Flags[0].setReturned();
702     break;
703   case TargetOpcode::G_MEMSET:
704     RTLibcall = RTLIB::MEMSET;
705     Args[0].Flags[0].setReturned();
706     break;
707   default:
708     llvm_unreachable("unsupported opcode");
709   }
710   const char *Name = TLI.getLibcallName(RTLibcall);
711 
712   // Unsupported libcall on the target.
713   if (!Name) {
714     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
715                       << MIRBuilder.getTII().getName(Opc) << "\n");
716     return LegalizerHelper::UnableToLegalize;
717   }
718 
719   CallLowering::CallLoweringInfo Info;
720   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
721   Info.Callee = MachineOperand::CreateES(Name);
722   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
723   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
724                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
725 
726   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
727   if (!CLI.lowerCall(MIRBuilder, Info))
728     return LegalizerHelper::UnableToLegalize;
729 
730   if (Info.LoweredTailCall) {
731     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
732 
733     // Check debug locations before removing the return.
734     LocObserver.checkpoint(true);
735 
736     // We must have a return following the call (or debug insts) to get past
737     // isLibCallInTailPosition.
738     do {
739       MachineInstr *Next = MI.getNextNode();
740       assert(Next &&
741              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
742              "Expected instr following MI to be return or debug inst?");
743       // We lowered a tail call, so the call is now the return from the block.
744       // Delete the old return.
745       Next->eraseFromParent();
746     } while (MI.getNextNode());
747 
748     // We expect to lose the debug location from the return.
749     LocObserver.checkpoint(false);
750   }
751 
752   return LegalizerHelper::Legalized;
753 }
754 
755 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
756                                        Type *FromType) {
757   auto ToMVT = MVT::getVT(ToType);
758   auto FromMVT = MVT::getVT(FromType);
759 
760   switch (Opcode) {
761   case TargetOpcode::G_FPEXT:
762     return RTLIB::getFPEXT(FromMVT, ToMVT);
763   case TargetOpcode::G_FPTRUNC:
764     return RTLIB::getFPROUND(FromMVT, ToMVT);
765   case TargetOpcode::G_FPTOSI:
766     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
767   case TargetOpcode::G_FPTOUI:
768     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
769   case TargetOpcode::G_SITOFP:
770     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
771   case TargetOpcode::G_UITOFP:
772     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
773   }
774   llvm_unreachable("Unsupported libcall function");
775 }
776 
777 static LegalizerHelper::LegalizeResult
778 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
779                   Type *FromType) {
780   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
781   return createLibcall(MIRBuilder, Libcall,
782                        {MI.getOperand(0).getReg(), ToType, 0},
783                        {{MI.getOperand(1).getReg(), FromType, 0}});
784 }
785 
786 LegalizerHelper::LegalizeResult
787 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
788   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
789   unsigned Size = LLTy.getSizeInBits();
790   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
791 
792   switch (MI.getOpcode()) {
793   default:
794     return UnableToLegalize;
795   case TargetOpcode::G_SDIV:
796   case TargetOpcode::G_UDIV:
797   case TargetOpcode::G_SREM:
798   case TargetOpcode::G_UREM:
799   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
800     Type *HLTy = IntegerType::get(Ctx, Size);
801     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
802     if (Status != Legalized)
803       return Status;
804     break;
805   }
806   case TargetOpcode::G_FADD:
807   case TargetOpcode::G_FSUB:
808   case TargetOpcode::G_FMUL:
809   case TargetOpcode::G_FDIV:
810   case TargetOpcode::G_FMA:
811   case TargetOpcode::G_FPOW:
812   case TargetOpcode::G_FREM:
813   case TargetOpcode::G_FCOS:
814   case TargetOpcode::G_FSIN:
815   case TargetOpcode::G_FLOG10:
816   case TargetOpcode::G_FLOG:
817   case TargetOpcode::G_FLOG2:
818   case TargetOpcode::G_FEXP:
819   case TargetOpcode::G_FEXP2:
820   case TargetOpcode::G_FCEIL:
821   case TargetOpcode::G_FFLOOR:
822   case TargetOpcode::G_FMINNUM:
823   case TargetOpcode::G_FMAXNUM:
824   case TargetOpcode::G_FSQRT:
825   case TargetOpcode::G_FRINT:
826   case TargetOpcode::G_FNEARBYINT:
827   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
828     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
829     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
830       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
831       return UnableToLegalize;
832     }
833     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
834     if (Status != Legalized)
835       return Status;
836     break;
837   }
838   case TargetOpcode::G_FPEXT:
839   case TargetOpcode::G_FPTRUNC: {
840     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
841     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
842     if (!FromTy || !ToTy)
843       return UnableToLegalize;
844     LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
845     if (Status != Legalized)
846       return Status;
847     break;
848   }
849   case TargetOpcode::G_FPTOSI:
850   case TargetOpcode::G_FPTOUI: {
851     // FIXME: Support other types
852     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
853     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
854     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
855       return UnableToLegalize;
856     LegalizeResult Status = conversionLibcall(
857         MI, MIRBuilder,
858         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
859         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
860     if (Status != Legalized)
861       return Status;
862     break;
863   }
864   case TargetOpcode::G_SITOFP:
865   case TargetOpcode::G_UITOFP: {
866     // FIXME: Support other types
867     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
868     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
869     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
870       return UnableToLegalize;
871     LegalizeResult Status = conversionLibcall(
872         MI, MIRBuilder,
873         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
874         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
875     if (Status != Legalized)
876       return Status;
877     break;
878   }
879   case TargetOpcode::G_BZERO:
880   case TargetOpcode::G_MEMCPY:
881   case TargetOpcode::G_MEMMOVE:
882   case TargetOpcode::G_MEMSET: {
883     LegalizeResult Result =
884         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
885     if (Result != Legalized)
886       return Result;
887     MI.eraseFromParent();
888     return Result;
889   }
890   }
891 
892   MI.eraseFromParent();
893   return Legalized;
894 }
895 
896 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
897                                                               unsigned TypeIdx,
898                                                               LLT NarrowTy) {
899   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
900   uint64_t NarrowSize = NarrowTy.getSizeInBits();
901 
902   switch (MI.getOpcode()) {
903   default:
904     return UnableToLegalize;
905   case TargetOpcode::G_IMPLICIT_DEF: {
906     Register DstReg = MI.getOperand(0).getReg();
907     LLT DstTy = MRI.getType(DstReg);
908 
909     // If SizeOp0 is not an exact multiple of NarrowSize, emit
910     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
911     // FIXME: Although this would also be legal for the general case, it causes
912     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
913     //  combines not being hit). This seems to be a problem related to the
914     //  artifact combiner.
915     if (SizeOp0 % NarrowSize != 0) {
916       LLT ImplicitTy = NarrowTy;
917       if (DstTy.isVector())
918         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
919 
920       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
921       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
922 
923       MI.eraseFromParent();
924       return Legalized;
925     }
926 
927     int NumParts = SizeOp0 / NarrowSize;
928 
929     SmallVector<Register, 2> DstRegs;
930     for (int i = 0; i < NumParts; ++i)
931       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
932 
933     if (DstTy.isVector())
934       MIRBuilder.buildBuildVector(DstReg, DstRegs);
935     else
936       MIRBuilder.buildMerge(DstReg, DstRegs);
937     MI.eraseFromParent();
938     return Legalized;
939   }
940   case TargetOpcode::G_CONSTANT: {
941     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
942     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
943     unsigned TotalSize = Ty.getSizeInBits();
944     unsigned NarrowSize = NarrowTy.getSizeInBits();
945     int NumParts = TotalSize / NarrowSize;
946 
947     SmallVector<Register, 4> PartRegs;
948     for (int I = 0; I != NumParts; ++I) {
949       unsigned Offset = I * NarrowSize;
950       auto K = MIRBuilder.buildConstant(NarrowTy,
951                                         Val.lshr(Offset).trunc(NarrowSize));
952       PartRegs.push_back(K.getReg(0));
953     }
954 
955     LLT LeftoverTy;
956     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
957     SmallVector<Register, 1> LeftoverRegs;
958     if (LeftoverBits != 0) {
959       LeftoverTy = LLT::scalar(LeftoverBits);
960       auto K = MIRBuilder.buildConstant(
961         LeftoverTy,
962         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
963       LeftoverRegs.push_back(K.getReg(0));
964     }
965 
966     insertParts(MI.getOperand(0).getReg(),
967                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
968 
969     MI.eraseFromParent();
970     return Legalized;
971   }
972   case TargetOpcode::G_SEXT:
973   case TargetOpcode::G_ZEXT:
974   case TargetOpcode::G_ANYEXT:
975     return narrowScalarExt(MI, TypeIdx, NarrowTy);
976   case TargetOpcode::G_TRUNC: {
977     if (TypeIdx != 1)
978       return UnableToLegalize;
979 
980     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
981     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
982       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
983       return UnableToLegalize;
984     }
985 
986     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
987     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
988     MI.eraseFromParent();
989     return Legalized;
990   }
991 
992   case TargetOpcode::G_FREEZE: {
993     if (TypeIdx != 0)
994       return UnableToLegalize;
995 
996     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
997     // Should widen scalar first
998     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
999       return UnableToLegalize;
1000 
1001     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1002     SmallVector<Register, 8> Parts;
1003     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1004       Parts.push_back(
1005           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1006     }
1007 
1008     MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
1009     MI.eraseFromParent();
1010     return Legalized;
1011   }
1012   case TargetOpcode::G_ADD:
1013   case TargetOpcode::G_SUB:
1014   case TargetOpcode::G_SADDO:
1015   case TargetOpcode::G_SSUBO:
1016   case TargetOpcode::G_SADDE:
1017   case TargetOpcode::G_SSUBE:
1018   case TargetOpcode::G_UADDO:
1019   case TargetOpcode::G_USUBO:
1020   case TargetOpcode::G_UADDE:
1021   case TargetOpcode::G_USUBE:
1022     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1023   case TargetOpcode::G_MUL:
1024   case TargetOpcode::G_UMULH:
1025     return narrowScalarMul(MI, NarrowTy);
1026   case TargetOpcode::G_EXTRACT:
1027     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1028   case TargetOpcode::G_INSERT:
1029     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1030   case TargetOpcode::G_LOAD: {
1031     auto &LoadMI = cast<GLoad>(MI);
1032     Register DstReg = LoadMI.getDstReg();
1033     LLT DstTy = MRI.getType(DstReg);
1034     if (DstTy.isVector())
1035       return UnableToLegalize;
1036 
1037     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1038       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1039       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1040       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1041       LoadMI.eraseFromParent();
1042       return Legalized;
1043     }
1044 
1045     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1046   }
1047   case TargetOpcode::G_ZEXTLOAD:
1048   case TargetOpcode::G_SEXTLOAD: {
1049     auto &LoadMI = cast<GExtLoad>(MI);
1050     Register DstReg = LoadMI.getDstReg();
1051     Register PtrReg = LoadMI.getPointerReg();
1052 
1053     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1054     auto &MMO = LoadMI.getMMO();
1055     unsigned MemSize = MMO.getSizeInBits();
1056 
1057     if (MemSize == NarrowSize) {
1058       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1059     } else if (MemSize < NarrowSize) {
1060       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1061     } else if (MemSize > NarrowSize) {
1062       // FIXME: Need to split the load.
1063       return UnableToLegalize;
1064     }
1065 
1066     if (isa<GZExtLoad>(LoadMI))
1067       MIRBuilder.buildZExt(DstReg, TmpReg);
1068     else
1069       MIRBuilder.buildSExt(DstReg, TmpReg);
1070 
1071     LoadMI.eraseFromParent();
1072     return Legalized;
1073   }
1074   case TargetOpcode::G_STORE: {
1075     auto &StoreMI = cast<GStore>(MI);
1076 
1077     Register SrcReg = StoreMI.getValueReg();
1078     LLT SrcTy = MRI.getType(SrcReg);
1079     if (SrcTy.isVector())
1080       return UnableToLegalize;
1081 
1082     int NumParts = SizeOp0 / NarrowSize;
1083     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1084     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1085     if (SrcTy.isVector() && LeftoverBits != 0)
1086       return UnableToLegalize;
1087 
1088     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1089       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1090       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1091       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1092       StoreMI.eraseFromParent();
1093       return Legalized;
1094     }
1095 
1096     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1097   }
1098   case TargetOpcode::G_SELECT:
1099     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1100   case TargetOpcode::G_AND:
1101   case TargetOpcode::G_OR:
1102   case TargetOpcode::G_XOR: {
1103     // Legalize bitwise operation:
1104     // A = BinOp<Ty> B, C
1105     // into:
1106     // B1, ..., BN = G_UNMERGE_VALUES B
1107     // C1, ..., CN = G_UNMERGE_VALUES C
1108     // A1 = BinOp<Ty/N> B1, C2
1109     // ...
1110     // AN = BinOp<Ty/N> BN, CN
1111     // A = G_MERGE_VALUES A1, ..., AN
1112     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1113   }
1114   case TargetOpcode::G_SHL:
1115   case TargetOpcode::G_LSHR:
1116   case TargetOpcode::G_ASHR:
1117     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1118   case TargetOpcode::G_CTLZ:
1119   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1120   case TargetOpcode::G_CTTZ:
1121   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1122   case TargetOpcode::G_CTPOP:
1123     if (TypeIdx == 1)
1124       switch (MI.getOpcode()) {
1125       case TargetOpcode::G_CTLZ:
1126       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1127         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1128       case TargetOpcode::G_CTTZ:
1129       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1130         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1131       case TargetOpcode::G_CTPOP:
1132         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1133       default:
1134         return UnableToLegalize;
1135       }
1136 
1137     Observer.changingInstr(MI);
1138     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1139     Observer.changedInstr(MI);
1140     return Legalized;
1141   case TargetOpcode::G_INTTOPTR:
1142     if (TypeIdx != 1)
1143       return UnableToLegalize;
1144 
1145     Observer.changingInstr(MI);
1146     narrowScalarSrc(MI, NarrowTy, 1);
1147     Observer.changedInstr(MI);
1148     return Legalized;
1149   case TargetOpcode::G_PTRTOINT:
1150     if (TypeIdx != 0)
1151       return UnableToLegalize;
1152 
1153     Observer.changingInstr(MI);
1154     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1155     Observer.changedInstr(MI);
1156     return Legalized;
1157   case TargetOpcode::G_PHI: {
1158     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1159     // NarrowSize.
1160     if (SizeOp0 % NarrowSize != 0)
1161       return UnableToLegalize;
1162 
1163     unsigned NumParts = SizeOp0 / NarrowSize;
1164     SmallVector<Register, 2> DstRegs(NumParts);
1165     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1166     Observer.changingInstr(MI);
1167     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1168       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1169       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
1170       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1171                    SrcRegs[i / 2]);
1172     }
1173     MachineBasicBlock &MBB = *MI.getParent();
1174     MIRBuilder.setInsertPt(MBB, MI);
1175     for (unsigned i = 0; i < NumParts; ++i) {
1176       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1177       MachineInstrBuilder MIB =
1178           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1179       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1180         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1181     }
1182     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1183     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1184     Observer.changedInstr(MI);
1185     MI.eraseFromParent();
1186     return Legalized;
1187   }
1188   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1189   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1190     if (TypeIdx != 2)
1191       return UnableToLegalize;
1192 
1193     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1194     Observer.changingInstr(MI);
1195     narrowScalarSrc(MI, NarrowTy, OpIdx);
1196     Observer.changedInstr(MI);
1197     return Legalized;
1198   }
1199   case TargetOpcode::G_ICMP: {
1200     Register LHS = MI.getOperand(2).getReg();
1201     LLT SrcTy = MRI.getType(LHS);
1202     uint64_t SrcSize = SrcTy.getSizeInBits();
1203     CmpInst::Predicate Pred =
1204         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1205 
1206     // TODO: Handle the non-equality case for weird sizes.
1207     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1208       return UnableToLegalize;
1209 
1210     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1211     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1212     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1213                       LHSLeftoverRegs))
1214       return UnableToLegalize;
1215 
1216     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1217     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1218     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1219                       RHSPartRegs, RHSLeftoverRegs))
1220       return UnableToLegalize;
1221 
1222     // We now have the LHS and RHS of the compare split into narrow-type
1223     // registers, plus potentially some leftover type.
1224     Register Dst = MI.getOperand(0).getReg();
1225     LLT ResTy = MRI.getType(Dst);
1226     if (ICmpInst::isEquality(Pred)) {
1227       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1228       // them together. For each equal part, the result should be all 0s. For
1229       // each non-equal part, we'll get at least one 1.
1230       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1231       SmallVector<Register, 4> Xors;
1232       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1233         auto LHS = std::get<0>(LHSAndRHS);
1234         auto RHS = std::get<1>(LHSAndRHS);
1235         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1236         Xors.push_back(Xor);
1237       }
1238 
1239       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1240       // to the desired narrow type so that we can OR them together later.
1241       SmallVector<Register, 4> WidenedXors;
1242       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1243         auto LHS = std::get<0>(LHSAndRHS);
1244         auto RHS = std::get<1>(LHSAndRHS);
1245         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1246         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1247         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1248                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1249         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1250       }
1251 
1252       // Now, for each part we broke up, we know if they are equal/not equal
1253       // based off the G_XOR. We can OR these all together and compare against
1254       // 0 to get the result.
1255       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1256       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1257       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1258         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1259       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1260     } else {
1261       // TODO: Handle non-power-of-two types.
1262       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1263       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1264       Register LHSL = LHSPartRegs[0];
1265       Register LHSH = LHSPartRegs[1];
1266       Register RHSL = RHSPartRegs[0];
1267       Register RHSH = RHSPartRegs[1];
1268       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1269       MachineInstrBuilder CmpHEQ =
1270           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1271       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1272           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1273       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1274     }
1275     MI.eraseFromParent();
1276     return Legalized;
1277   }
1278   case TargetOpcode::G_SEXT_INREG: {
1279     if (TypeIdx != 0)
1280       return UnableToLegalize;
1281 
1282     int64_t SizeInBits = MI.getOperand(2).getImm();
1283 
1284     // So long as the new type has more bits than the bits we're extending we
1285     // don't need to break it apart.
1286     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1287       Observer.changingInstr(MI);
1288       // We don't lose any non-extension bits by truncating the src and
1289       // sign-extending the dst.
1290       MachineOperand &MO1 = MI.getOperand(1);
1291       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1292       MO1.setReg(TruncMIB.getReg(0));
1293 
1294       MachineOperand &MO2 = MI.getOperand(0);
1295       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1296       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1297       MIRBuilder.buildSExt(MO2, DstExt);
1298       MO2.setReg(DstExt);
1299       Observer.changedInstr(MI);
1300       return Legalized;
1301     }
1302 
1303     // Break it apart. Components below the extension point are unmodified. The
1304     // component containing the extension point becomes a narrower SEXT_INREG.
1305     // Components above it are ashr'd from the component containing the
1306     // extension point.
1307     if (SizeOp0 % NarrowSize != 0)
1308       return UnableToLegalize;
1309     int NumParts = SizeOp0 / NarrowSize;
1310 
1311     // List the registers where the destination will be scattered.
1312     SmallVector<Register, 2> DstRegs;
1313     // List the registers where the source will be split.
1314     SmallVector<Register, 2> SrcRegs;
1315 
1316     // Create all the temporary registers.
1317     for (int i = 0; i < NumParts; ++i) {
1318       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1319 
1320       SrcRegs.push_back(SrcReg);
1321     }
1322 
1323     // Explode the big arguments into smaller chunks.
1324     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1325 
1326     Register AshrCstReg =
1327         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1328             .getReg(0);
1329     Register FullExtensionReg = 0;
1330     Register PartialExtensionReg = 0;
1331 
1332     // Do the operation on each small part.
1333     for (int i = 0; i < NumParts; ++i) {
1334       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1335         DstRegs.push_back(SrcRegs[i]);
1336       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1337         assert(PartialExtensionReg &&
1338                "Expected to visit partial extension before full");
1339         if (FullExtensionReg) {
1340           DstRegs.push_back(FullExtensionReg);
1341           continue;
1342         }
1343         DstRegs.push_back(
1344             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1345                 .getReg(0));
1346         FullExtensionReg = DstRegs.back();
1347       } else {
1348         DstRegs.push_back(
1349             MIRBuilder
1350                 .buildInstr(
1351                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1352                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1353                 .getReg(0));
1354         PartialExtensionReg = DstRegs.back();
1355       }
1356     }
1357 
1358     // Gather the destination registers into the final destination.
1359     Register DstReg = MI.getOperand(0).getReg();
1360     MIRBuilder.buildMerge(DstReg, DstRegs);
1361     MI.eraseFromParent();
1362     return Legalized;
1363   }
1364   case TargetOpcode::G_BSWAP:
1365   case TargetOpcode::G_BITREVERSE: {
1366     if (SizeOp0 % NarrowSize != 0)
1367       return UnableToLegalize;
1368 
1369     Observer.changingInstr(MI);
1370     SmallVector<Register, 2> SrcRegs, DstRegs;
1371     unsigned NumParts = SizeOp0 / NarrowSize;
1372     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1373 
1374     for (unsigned i = 0; i < NumParts; ++i) {
1375       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1376                                            {SrcRegs[NumParts - 1 - i]});
1377       DstRegs.push_back(DstPart.getReg(0));
1378     }
1379 
1380     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1381 
1382     Observer.changedInstr(MI);
1383     MI.eraseFromParent();
1384     return Legalized;
1385   }
1386   case TargetOpcode::G_PTR_ADD:
1387   case TargetOpcode::G_PTRMASK: {
1388     if (TypeIdx != 1)
1389       return UnableToLegalize;
1390     Observer.changingInstr(MI);
1391     narrowScalarSrc(MI, NarrowTy, 2);
1392     Observer.changedInstr(MI);
1393     return Legalized;
1394   }
1395   case TargetOpcode::G_FPTOUI:
1396   case TargetOpcode::G_FPTOSI:
1397     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1398   case TargetOpcode::G_FPEXT:
1399     if (TypeIdx != 0)
1400       return UnableToLegalize;
1401     Observer.changingInstr(MI);
1402     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1403     Observer.changedInstr(MI);
1404     return Legalized;
1405   }
1406 }
1407 
1408 Register LegalizerHelper::coerceToScalar(Register Val) {
1409   LLT Ty = MRI.getType(Val);
1410   if (Ty.isScalar())
1411     return Val;
1412 
1413   const DataLayout &DL = MIRBuilder.getDataLayout();
1414   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1415   if (Ty.isPointer()) {
1416     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1417       return Register();
1418     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1419   }
1420 
1421   Register NewVal = Val;
1422 
1423   assert(Ty.isVector());
1424   LLT EltTy = Ty.getElementType();
1425   if (EltTy.isPointer())
1426     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1427   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1428 }
1429 
1430 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1431                                      unsigned OpIdx, unsigned ExtOpcode) {
1432   MachineOperand &MO = MI.getOperand(OpIdx);
1433   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1434   MO.setReg(ExtB.getReg(0));
1435 }
1436 
1437 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1438                                       unsigned OpIdx) {
1439   MachineOperand &MO = MI.getOperand(OpIdx);
1440   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1441   MO.setReg(ExtB.getReg(0));
1442 }
1443 
1444 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1445                                      unsigned OpIdx, unsigned TruncOpcode) {
1446   MachineOperand &MO = MI.getOperand(OpIdx);
1447   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1448   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1449   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1450   MO.setReg(DstExt);
1451 }
1452 
1453 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1454                                       unsigned OpIdx, unsigned ExtOpcode) {
1455   MachineOperand &MO = MI.getOperand(OpIdx);
1456   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1457   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1458   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1459   MO.setReg(DstTrunc);
1460 }
1461 
1462 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1463                                             unsigned OpIdx) {
1464   MachineOperand &MO = MI.getOperand(OpIdx);
1465   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1466   Register Dst = MO.getReg();
1467   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1468   MO.setReg(DstExt);
1469   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1470 }
1471 
1472 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1473                                             unsigned OpIdx) {
1474   MachineOperand &MO = MI.getOperand(OpIdx);
1475   SmallVector<Register, 8> Regs;
1476   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1477 }
1478 
1479 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1480   MachineOperand &Op = MI.getOperand(OpIdx);
1481   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1482 }
1483 
1484 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1485   MachineOperand &MO = MI.getOperand(OpIdx);
1486   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1487   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1488   MIRBuilder.buildBitcast(MO, CastDst);
1489   MO.setReg(CastDst);
1490 }
1491 
1492 LegalizerHelper::LegalizeResult
1493 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1494                                         LLT WideTy) {
1495   if (TypeIdx != 1)
1496     return UnableToLegalize;
1497 
1498   Register DstReg = MI.getOperand(0).getReg();
1499   LLT DstTy = MRI.getType(DstReg);
1500   if (DstTy.isVector())
1501     return UnableToLegalize;
1502 
1503   Register Src1 = MI.getOperand(1).getReg();
1504   LLT SrcTy = MRI.getType(Src1);
1505   const int DstSize = DstTy.getSizeInBits();
1506   const int SrcSize = SrcTy.getSizeInBits();
1507   const int WideSize = WideTy.getSizeInBits();
1508   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1509 
1510   unsigned NumOps = MI.getNumOperands();
1511   unsigned NumSrc = MI.getNumOperands() - 1;
1512   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1513 
1514   if (WideSize >= DstSize) {
1515     // Directly pack the bits in the target type.
1516     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1517 
1518     for (unsigned I = 2; I != NumOps; ++I) {
1519       const unsigned Offset = (I - 1) * PartSize;
1520 
1521       Register SrcReg = MI.getOperand(I).getReg();
1522       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1523 
1524       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1525 
1526       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1527         MRI.createGenericVirtualRegister(WideTy);
1528 
1529       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1530       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1531       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1532       ResultReg = NextResult;
1533     }
1534 
1535     if (WideSize > DstSize)
1536       MIRBuilder.buildTrunc(DstReg, ResultReg);
1537     else if (DstTy.isPointer())
1538       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1539 
1540     MI.eraseFromParent();
1541     return Legalized;
1542   }
1543 
1544   // Unmerge the original values to the GCD type, and recombine to the next
1545   // multiple greater than the original type.
1546   //
1547   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1548   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1549   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1550   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1551   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1552   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1553   // %12:_(s12) = G_MERGE_VALUES %10, %11
1554   //
1555   // Padding with undef if necessary:
1556   //
1557   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1558   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1559   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1560   // %7:_(s2) = G_IMPLICIT_DEF
1561   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1562   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1563   // %10:_(s12) = G_MERGE_VALUES %8, %9
1564 
1565   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1566   LLT GCDTy = LLT::scalar(GCD);
1567 
1568   SmallVector<Register, 8> Parts;
1569   SmallVector<Register, 8> NewMergeRegs;
1570   SmallVector<Register, 8> Unmerges;
1571   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1572 
1573   // Decompose the original operands if they don't evenly divide.
1574   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1575     Register SrcReg = MO.getReg();
1576     if (GCD == SrcSize) {
1577       Unmerges.push_back(SrcReg);
1578     } else {
1579       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1580       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1581         Unmerges.push_back(Unmerge.getReg(J));
1582     }
1583   }
1584 
1585   // Pad with undef to the next size that is a multiple of the requested size.
1586   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1587     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1588     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1589       Unmerges.push_back(UndefReg);
1590   }
1591 
1592   const int PartsPerGCD = WideSize / GCD;
1593 
1594   // Build merges of each piece.
1595   ArrayRef<Register> Slicer(Unmerges);
1596   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1597     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1598     NewMergeRegs.push_back(Merge.getReg(0));
1599   }
1600 
1601   // A truncate may be necessary if the requested type doesn't evenly divide the
1602   // original result type.
1603   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1604     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1605   } else {
1606     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1607     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1608   }
1609 
1610   MI.eraseFromParent();
1611   return Legalized;
1612 }
1613 
1614 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1615   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1616   LLT OrigTy = MRI.getType(OrigReg);
1617   LLT LCMTy = getLCMType(WideTy, OrigTy);
1618 
1619   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1620   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1621 
1622   Register UnmergeSrc = WideReg;
1623 
1624   // Create a merge to the LCM type, padding with undef
1625   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1626   // =>
1627   // %1:_(<4 x s32>) = G_FOO
1628   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1629   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1630   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1631   if (NumMergeParts > 1) {
1632     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1633     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1634     MergeParts[0] = WideReg;
1635     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1636   }
1637 
1638   // Unmerge to the original register and pad with dead defs.
1639   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1640   UnmergeResults[0] = OrigReg;
1641   for (int I = 1; I != NumUnmergeParts; ++I)
1642     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1643 
1644   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1645   return WideReg;
1646 }
1647 
1648 LegalizerHelper::LegalizeResult
1649 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1650                                           LLT WideTy) {
1651   if (TypeIdx != 0)
1652     return UnableToLegalize;
1653 
1654   int NumDst = MI.getNumOperands() - 1;
1655   Register SrcReg = MI.getOperand(NumDst).getReg();
1656   LLT SrcTy = MRI.getType(SrcReg);
1657   if (SrcTy.isVector())
1658     return UnableToLegalize;
1659 
1660   Register Dst0Reg = MI.getOperand(0).getReg();
1661   LLT DstTy = MRI.getType(Dst0Reg);
1662   if (!DstTy.isScalar())
1663     return UnableToLegalize;
1664 
1665   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1666     if (SrcTy.isPointer()) {
1667       const DataLayout &DL = MIRBuilder.getDataLayout();
1668       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1669         LLVM_DEBUG(
1670             dbgs() << "Not casting non-integral address space integer\n");
1671         return UnableToLegalize;
1672       }
1673 
1674       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1675       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1676     }
1677 
1678     // Widen SrcTy to WideTy. This does not affect the result, but since the
1679     // user requested this size, it is probably better handled than SrcTy and
1680     // should reduce the total number of legalization artifacts.
1681     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1682       SrcTy = WideTy;
1683       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1684     }
1685 
1686     // Theres no unmerge type to target. Directly extract the bits from the
1687     // source type
1688     unsigned DstSize = DstTy.getSizeInBits();
1689 
1690     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1691     for (int I = 1; I != NumDst; ++I) {
1692       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1693       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1694       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1695     }
1696 
1697     MI.eraseFromParent();
1698     return Legalized;
1699   }
1700 
1701   // Extend the source to a wider type.
1702   LLT LCMTy = getLCMType(SrcTy, WideTy);
1703 
1704   Register WideSrc = SrcReg;
1705   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1706     // TODO: If this is an integral address space, cast to integer and anyext.
1707     if (SrcTy.isPointer()) {
1708       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1709       return UnableToLegalize;
1710     }
1711 
1712     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1713   }
1714 
1715   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1716 
1717   // Create a sequence of unmerges and merges to the original results. Since we
1718   // may have widened the source, we will need to pad the results with dead defs
1719   // to cover the source register.
1720   // e.g. widen s48 to s64:
1721   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1722   //
1723   // =>
1724   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1725   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1726   //  ; unpack to GCD type, with extra dead defs
1727   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1728   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1729   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1730   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1731   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1732   const LLT GCDTy = getGCDType(WideTy, DstTy);
1733   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1734   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1735 
1736   // Directly unmerge to the destination without going through a GCD type
1737   // if possible
1738   if (PartsPerRemerge == 1) {
1739     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1740 
1741     for (int I = 0; I != NumUnmerge; ++I) {
1742       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1743 
1744       for (int J = 0; J != PartsPerUnmerge; ++J) {
1745         int Idx = I * PartsPerUnmerge + J;
1746         if (Idx < NumDst)
1747           MIB.addDef(MI.getOperand(Idx).getReg());
1748         else {
1749           // Create dead def for excess components.
1750           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1751         }
1752       }
1753 
1754       MIB.addUse(Unmerge.getReg(I));
1755     }
1756   } else {
1757     SmallVector<Register, 16> Parts;
1758     for (int J = 0; J != NumUnmerge; ++J)
1759       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1760 
1761     SmallVector<Register, 8> RemergeParts;
1762     for (int I = 0; I != NumDst; ++I) {
1763       for (int J = 0; J < PartsPerRemerge; ++J) {
1764         const int Idx = I * PartsPerRemerge + J;
1765         RemergeParts.emplace_back(Parts[Idx]);
1766       }
1767 
1768       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1769       RemergeParts.clear();
1770     }
1771   }
1772 
1773   MI.eraseFromParent();
1774   return Legalized;
1775 }
1776 
1777 LegalizerHelper::LegalizeResult
1778 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1779                                     LLT WideTy) {
1780   Register DstReg = MI.getOperand(0).getReg();
1781   Register SrcReg = MI.getOperand(1).getReg();
1782   LLT SrcTy = MRI.getType(SrcReg);
1783 
1784   LLT DstTy = MRI.getType(DstReg);
1785   unsigned Offset = MI.getOperand(2).getImm();
1786 
1787   if (TypeIdx == 0) {
1788     if (SrcTy.isVector() || DstTy.isVector())
1789       return UnableToLegalize;
1790 
1791     SrcOp Src(SrcReg);
1792     if (SrcTy.isPointer()) {
1793       // Extracts from pointers can be handled only if they are really just
1794       // simple integers.
1795       const DataLayout &DL = MIRBuilder.getDataLayout();
1796       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1797         return UnableToLegalize;
1798 
1799       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1800       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1801       SrcTy = SrcAsIntTy;
1802     }
1803 
1804     if (DstTy.isPointer())
1805       return UnableToLegalize;
1806 
1807     if (Offset == 0) {
1808       // Avoid a shift in the degenerate case.
1809       MIRBuilder.buildTrunc(DstReg,
1810                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1811       MI.eraseFromParent();
1812       return Legalized;
1813     }
1814 
1815     // Do a shift in the source type.
1816     LLT ShiftTy = SrcTy;
1817     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1818       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1819       ShiftTy = WideTy;
1820     }
1821 
1822     auto LShr = MIRBuilder.buildLShr(
1823       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1824     MIRBuilder.buildTrunc(DstReg, LShr);
1825     MI.eraseFromParent();
1826     return Legalized;
1827   }
1828 
1829   if (SrcTy.isScalar()) {
1830     Observer.changingInstr(MI);
1831     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1832     Observer.changedInstr(MI);
1833     return Legalized;
1834   }
1835 
1836   if (!SrcTy.isVector())
1837     return UnableToLegalize;
1838 
1839   if (DstTy != SrcTy.getElementType())
1840     return UnableToLegalize;
1841 
1842   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1843     return UnableToLegalize;
1844 
1845   Observer.changingInstr(MI);
1846   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1847 
1848   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1849                           Offset);
1850   widenScalarDst(MI, WideTy.getScalarType(), 0);
1851   Observer.changedInstr(MI);
1852   return Legalized;
1853 }
1854 
1855 LegalizerHelper::LegalizeResult
1856 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1857                                    LLT WideTy) {
1858   if (TypeIdx != 0 || WideTy.isVector())
1859     return UnableToLegalize;
1860   Observer.changingInstr(MI);
1861   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1862   widenScalarDst(MI, WideTy);
1863   Observer.changedInstr(MI);
1864   return Legalized;
1865 }
1866 
1867 LegalizerHelper::LegalizeResult
1868 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1869                                            LLT WideTy) {
1870   if (TypeIdx == 1)
1871     return UnableToLegalize; // TODO
1872 
1873   unsigned Opcode;
1874   unsigned ExtOpcode;
1875   Optional<Register> CarryIn = None;
1876   switch (MI.getOpcode()) {
1877   default:
1878     llvm_unreachable("Unexpected opcode!");
1879   case TargetOpcode::G_SADDO:
1880     Opcode = TargetOpcode::G_ADD;
1881     ExtOpcode = TargetOpcode::G_SEXT;
1882     break;
1883   case TargetOpcode::G_SSUBO:
1884     Opcode = TargetOpcode::G_SUB;
1885     ExtOpcode = TargetOpcode::G_SEXT;
1886     break;
1887   case TargetOpcode::G_UADDO:
1888     Opcode = TargetOpcode::G_ADD;
1889     ExtOpcode = TargetOpcode::G_ZEXT;
1890     break;
1891   case TargetOpcode::G_USUBO:
1892     Opcode = TargetOpcode::G_SUB;
1893     ExtOpcode = TargetOpcode::G_ZEXT;
1894     break;
1895   case TargetOpcode::G_SADDE:
1896     Opcode = TargetOpcode::G_UADDE;
1897     ExtOpcode = TargetOpcode::G_SEXT;
1898     CarryIn = MI.getOperand(4).getReg();
1899     break;
1900   case TargetOpcode::G_SSUBE:
1901     Opcode = TargetOpcode::G_USUBE;
1902     ExtOpcode = TargetOpcode::G_SEXT;
1903     CarryIn = MI.getOperand(4).getReg();
1904     break;
1905   case TargetOpcode::G_UADDE:
1906     Opcode = TargetOpcode::G_UADDE;
1907     ExtOpcode = TargetOpcode::G_ZEXT;
1908     CarryIn = MI.getOperand(4).getReg();
1909     break;
1910   case TargetOpcode::G_USUBE:
1911     Opcode = TargetOpcode::G_USUBE;
1912     ExtOpcode = TargetOpcode::G_ZEXT;
1913     CarryIn = MI.getOperand(4).getReg();
1914     break;
1915   }
1916 
1917   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1918   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1919   // Do the arithmetic in the larger type.
1920   Register NewOp;
1921   if (CarryIn) {
1922     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1923     NewOp = MIRBuilder
1924                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1925                             {LHSExt, RHSExt, *CarryIn})
1926                 .getReg(0);
1927   } else {
1928     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1929   }
1930   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1931   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1932   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1933   // There is no overflow if the ExtOp is the same as NewOp.
1934   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1935   // Now trunc the NewOp to the original result.
1936   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1937   MI.eraseFromParent();
1938   return Legalized;
1939 }
1940 
1941 LegalizerHelper::LegalizeResult
1942 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1943                                          LLT WideTy) {
1944   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1945                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1946                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1947   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1948                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1949   // We can convert this to:
1950   //   1. Any extend iN to iM
1951   //   2. SHL by M-N
1952   //   3. [US][ADD|SUB|SHL]SAT
1953   //   4. L/ASHR by M-N
1954   //
1955   // It may be more efficient to lower this to a min and a max operation in
1956   // the higher precision arithmetic if the promoted operation isn't legal,
1957   // but this decision is up to the target's lowering request.
1958   Register DstReg = MI.getOperand(0).getReg();
1959 
1960   unsigned NewBits = WideTy.getScalarSizeInBits();
1961   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1962 
1963   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1964   // must not left shift the RHS to preserve the shift amount.
1965   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1966   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1967                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1968   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1969   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1970   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1971 
1972   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1973                                         {ShiftL, ShiftR}, MI.getFlags());
1974 
1975   // Use a shift that will preserve the number of sign bits when the trunc is
1976   // folded away.
1977   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1978                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1979 
1980   MIRBuilder.buildTrunc(DstReg, Result);
1981   MI.eraseFromParent();
1982   return Legalized;
1983 }
1984 
1985 LegalizerHelper::LegalizeResult
1986 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1987                                  LLT WideTy) {
1988   if (TypeIdx == 1)
1989     return UnableToLegalize;
1990 
1991   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1992   Register Result = MI.getOperand(0).getReg();
1993   Register OriginalOverflow = MI.getOperand(1).getReg();
1994   Register LHS = MI.getOperand(2).getReg();
1995   Register RHS = MI.getOperand(3).getReg();
1996   LLT SrcTy = MRI.getType(LHS);
1997   LLT OverflowTy = MRI.getType(OriginalOverflow);
1998   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1999 
2000   // To determine if the result overflowed in the larger type, we extend the
2001   // input to the larger type, do the multiply (checking if it overflows),
2002   // then also check the high bits of the result to see if overflow happened
2003   // there.
2004   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2005   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2006   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2007 
2008   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
2009                                     {LeftOperand, RightOperand});
2010   auto Mul = Mulo->getOperand(0);
2011   MIRBuilder.buildTrunc(Result, Mul);
2012 
2013   MachineInstrBuilder ExtResult;
2014   // Overflow occurred if it occurred in the larger type, or if the high part
2015   // of the result does not zero/sign-extend the low part.  Check this second
2016   // possibility first.
2017   if (IsSigned) {
2018     // For signed, overflow occurred when the high part does not sign-extend
2019     // the low part.
2020     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2021   } else {
2022     // Unsigned overflow occurred when the high part does not zero-extend the
2023     // low part.
2024     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2025   }
2026 
2027   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2028   // so we don't need to check the overflow result of larger type Mulo.
2029   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
2030     auto Overflow =
2031         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2032     // Finally check if the multiplication in the larger type itself overflowed.
2033     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2034   } else {
2035     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2036   }
2037   MI.eraseFromParent();
2038   return Legalized;
2039 }
2040 
2041 LegalizerHelper::LegalizeResult
2042 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2043   switch (MI.getOpcode()) {
2044   default:
2045     return UnableToLegalize;
2046   case TargetOpcode::G_ATOMICRMW_XCHG:
2047   case TargetOpcode::G_ATOMICRMW_ADD:
2048   case TargetOpcode::G_ATOMICRMW_SUB:
2049   case TargetOpcode::G_ATOMICRMW_AND:
2050   case TargetOpcode::G_ATOMICRMW_OR:
2051   case TargetOpcode::G_ATOMICRMW_XOR:
2052   case TargetOpcode::G_ATOMICRMW_MIN:
2053   case TargetOpcode::G_ATOMICRMW_MAX:
2054   case TargetOpcode::G_ATOMICRMW_UMIN:
2055   case TargetOpcode::G_ATOMICRMW_UMAX:
2056     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2057     Observer.changingInstr(MI);
2058     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2059     widenScalarDst(MI, WideTy, 0);
2060     Observer.changedInstr(MI);
2061     return Legalized;
2062   case TargetOpcode::G_ATOMIC_CMPXCHG:
2063     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2064     Observer.changingInstr(MI);
2065     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2066     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2067     widenScalarDst(MI, WideTy, 0);
2068     Observer.changedInstr(MI);
2069     return Legalized;
2070   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2071     if (TypeIdx == 0) {
2072       Observer.changingInstr(MI);
2073       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2074       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2075       widenScalarDst(MI, WideTy, 0);
2076       Observer.changedInstr(MI);
2077       return Legalized;
2078     }
2079     assert(TypeIdx == 1 &&
2080            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2081     Observer.changingInstr(MI);
2082     widenScalarDst(MI, WideTy, 1);
2083     Observer.changedInstr(MI);
2084     return Legalized;
2085   case TargetOpcode::G_EXTRACT:
2086     return widenScalarExtract(MI, TypeIdx, WideTy);
2087   case TargetOpcode::G_INSERT:
2088     return widenScalarInsert(MI, TypeIdx, WideTy);
2089   case TargetOpcode::G_MERGE_VALUES:
2090     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2091   case TargetOpcode::G_UNMERGE_VALUES:
2092     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2093   case TargetOpcode::G_SADDO:
2094   case TargetOpcode::G_SSUBO:
2095   case TargetOpcode::G_UADDO:
2096   case TargetOpcode::G_USUBO:
2097   case TargetOpcode::G_SADDE:
2098   case TargetOpcode::G_SSUBE:
2099   case TargetOpcode::G_UADDE:
2100   case TargetOpcode::G_USUBE:
2101     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2102   case TargetOpcode::G_UMULO:
2103   case TargetOpcode::G_SMULO:
2104     return widenScalarMulo(MI, TypeIdx, WideTy);
2105   case TargetOpcode::G_SADDSAT:
2106   case TargetOpcode::G_SSUBSAT:
2107   case TargetOpcode::G_SSHLSAT:
2108   case TargetOpcode::G_UADDSAT:
2109   case TargetOpcode::G_USUBSAT:
2110   case TargetOpcode::G_USHLSAT:
2111     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2112   case TargetOpcode::G_CTTZ:
2113   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2114   case TargetOpcode::G_CTLZ:
2115   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2116   case TargetOpcode::G_CTPOP: {
2117     if (TypeIdx == 0) {
2118       Observer.changingInstr(MI);
2119       widenScalarDst(MI, WideTy, 0);
2120       Observer.changedInstr(MI);
2121       return Legalized;
2122     }
2123 
2124     Register SrcReg = MI.getOperand(1).getReg();
2125 
2126     // First extend the input.
2127     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2128                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2129                           ? TargetOpcode::G_ANYEXT
2130                           : TargetOpcode::G_ZEXT;
2131     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2132     LLT CurTy = MRI.getType(SrcReg);
2133     unsigned NewOpc = MI.getOpcode();
2134     if (NewOpc == TargetOpcode::G_CTTZ) {
2135       // The count is the same in the larger type except if the original
2136       // value was zero.  This can be handled by setting the bit just off
2137       // the top of the original type.
2138       auto TopBit =
2139           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2140       MIBSrc = MIRBuilder.buildOr(
2141         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2142       // Now we know the operand is non-zero, use the more relaxed opcode.
2143       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2144     }
2145 
2146     // Perform the operation at the larger size.
2147     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2148     // This is already the correct result for CTPOP and CTTZs
2149     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2150         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2151       // The correct result is NewOp - (Difference in widety and current ty).
2152       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2153       MIBNewOp = MIRBuilder.buildSub(
2154           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2155     }
2156 
2157     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2158     MI.eraseFromParent();
2159     return Legalized;
2160   }
2161   case TargetOpcode::G_BSWAP: {
2162     Observer.changingInstr(MI);
2163     Register DstReg = MI.getOperand(0).getReg();
2164 
2165     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2166     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2167     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2168     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2169 
2170     MI.getOperand(0).setReg(DstExt);
2171 
2172     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2173 
2174     LLT Ty = MRI.getType(DstReg);
2175     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2176     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2177     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2178 
2179     MIRBuilder.buildTrunc(DstReg, ShrReg);
2180     Observer.changedInstr(MI);
2181     return Legalized;
2182   }
2183   case TargetOpcode::G_BITREVERSE: {
2184     Observer.changingInstr(MI);
2185 
2186     Register DstReg = MI.getOperand(0).getReg();
2187     LLT Ty = MRI.getType(DstReg);
2188     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2189 
2190     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2191     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2192     MI.getOperand(0).setReg(DstExt);
2193     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2194 
2195     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2196     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2197     MIRBuilder.buildTrunc(DstReg, Shift);
2198     Observer.changedInstr(MI);
2199     return Legalized;
2200   }
2201   case TargetOpcode::G_FREEZE:
2202     Observer.changingInstr(MI);
2203     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2204     widenScalarDst(MI, WideTy);
2205     Observer.changedInstr(MI);
2206     return Legalized;
2207 
2208   case TargetOpcode::G_ABS:
2209     Observer.changingInstr(MI);
2210     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2211     widenScalarDst(MI, WideTy);
2212     Observer.changedInstr(MI);
2213     return Legalized;
2214 
2215   case TargetOpcode::G_ADD:
2216   case TargetOpcode::G_AND:
2217   case TargetOpcode::G_MUL:
2218   case TargetOpcode::G_OR:
2219   case TargetOpcode::G_XOR:
2220   case TargetOpcode::G_SUB:
2221     // Perform operation at larger width (any extension is fines here, high bits
2222     // don't affect the result) and then truncate the result back to the
2223     // original type.
2224     Observer.changingInstr(MI);
2225     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2226     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2227     widenScalarDst(MI, WideTy);
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230 
2231   case TargetOpcode::G_SBFX:
2232   case TargetOpcode::G_UBFX:
2233     Observer.changingInstr(MI);
2234 
2235     if (TypeIdx == 0) {
2236       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2237       widenScalarDst(MI, WideTy);
2238     } else {
2239       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2240       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2241     }
2242 
2243     Observer.changedInstr(MI);
2244     return Legalized;
2245 
2246   case TargetOpcode::G_SHL:
2247     Observer.changingInstr(MI);
2248 
2249     if (TypeIdx == 0) {
2250       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2251       widenScalarDst(MI, WideTy);
2252     } else {
2253       assert(TypeIdx == 1);
2254       // The "number of bits to shift" operand must preserve its value as an
2255       // unsigned integer:
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2257     }
2258 
2259     Observer.changedInstr(MI);
2260     return Legalized;
2261 
2262   case TargetOpcode::G_SDIV:
2263   case TargetOpcode::G_SREM:
2264   case TargetOpcode::G_SMIN:
2265   case TargetOpcode::G_SMAX:
2266     Observer.changingInstr(MI);
2267     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2268     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2269     widenScalarDst(MI, WideTy);
2270     Observer.changedInstr(MI);
2271     return Legalized;
2272 
2273   case TargetOpcode::G_SDIVREM:
2274     Observer.changingInstr(MI);
2275     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2276     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2277     widenScalarDst(MI, WideTy);
2278     widenScalarDst(MI, WideTy, 1);
2279     Observer.changedInstr(MI);
2280     return Legalized;
2281 
2282   case TargetOpcode::G_ASHR:
2283   case TargetOpcode::G_LSHR:
2284     Observer.changingInstr(MI);
2285 
2286     if (TypeIdx == 0) {
2287       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2288         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2289 
2290       widenScalarSrc(MI, WideTy, 1, CvtOp);
2291       widenScalarDst(MI, WideTy);
2292     } else {
2293       assert(TypeIdx == 1);
2294       // The "number of bits to shift" operand must preserve its value as an
2295       // unsigned integer:
2296       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2297     }
2298 
2299     Observer.changedInstr(MI);
2300     return Legalized;
2301   case TargetOpcode::G_UDIV:
2302   case TargetOpcode::G_UREM:
2303   case TargetOpcode::G_UMIN:
2304   case TargetOpcode::G_UMAX:
2305     Observer.changingInstr(MI);
2306     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2307     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2308     widenScalarDst(MI, WideTy);
2309     Observer.changedInstr(MI);
2310     return Legalized;
2311 
2312   case TargetOpcode::G_UDIVREM:
2313     Observer.changingInstr(MI);
2314     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2315     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2316     widenScalarDst(MI, WideTy);
2317     widenScalarDst(MI, WideTy, 1);
2318     Observer.changedInstr(MI);
2319     return Legalized;
2320 
2321   case TargetOpcode::G_SELECT:
2322     Observer.changingInstr(MI);
2323     if (TypeIdx == 0) {
2324       // Perform operation at larger width (any extension is fine here, high
2325       // bits don't affect the result) and then truncate the result back to the
2326       // original type.
2327       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2328       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2329       widenScalarDst(MI, WideTy);
2330     } else {
2331       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2332       // Explicit extension is required here since high bits affect the result.
2333       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2334     }
2335     Observer.changedInstr(MI);
2336     return Legalized;
2337 
2338   case TargetOpcode::G_FPTOSI:
2339   case TargetOpcode::G_FPTOUI:
2340     Observer.changingInstr(MI);
2341 
2342     if (TypeIdx == 0)
2343       widenScalarDst(MI, WideTy);
2344     else
2345       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2346 
2347     Observer.changedInstr(MI);
2348     return Legalized;
2349   case TargetOpcode::G_SITOFP:
2350     Observer.changingInstr(MI);
2351 
2352     if (TypeIdx == 0)
2353       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2354     else
2355       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2356 
2357     Observer.changedInstr(MI);
2358     return Legalized;
2359   case TargetOpcode::G_UITOFP:
2360     Observer.changingInstr(MI);
2361 
2362     if (TypeIdx == 0)
2363       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2364     else
2365       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2366 
2367     Observer.changedInstr(MI);
2368     return Legalized;
2369   case TargetOpcode::G_LOAD:
2370   case TargetOpcode::G_SEXTLOAD:
2371   case TargetOpcode::G_ZEXTLOAD:
2372     Observer.changingInstr(MI);
2373     widenScalarDst(MI, WideTy);
2374     Observer.changedInstr(MI);
2375     return Legalized;
2376 
2377   case TargetOpcode::G_STORE: {
2378     if (TypeIdx != 0)
2379       return UnableToLegalize;
2380 
2381     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2382     if (!Ty.isScalar())
2383       return UnableToLegalize;
2384 
2385     Observer.changingInstr(MI);
2386 
2387     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2388       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2389     widenScalarSrc(MI, WideTy, 0, ExtType);
2390 
2391     Observer.changedInstr(MI);
2392     return Legalized;
2393   }
2394   case TargetOpcode::G_CONSTANT: {
2395     MachineOperand &SrcMO = MI.getOperand(1);
2396     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2397     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2398         MRI.getType(MI.getOperand(0).getReg()));
2399     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2400             ExtOpc == TargetOpcode::G_ANYEXT) &&
2401            "Illegal Extend");
2402     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2403     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2404                            ? SrcVal.sext(WideTy.getSizeInBits())
2405                            : SrcVal.zext(WideTy.getSizeInBits());
2406     Observer.changingInstr(MI);
2407     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2408 
2409     widenScalarDst(MI, WideTy);
2410     Observer.changedInstr(MI);
2411     return Legalized;
2412   }
2413   case TargetOpcode::G_FCONSTANT: {
2414     MachineOperand &SrcMO = MI.getOperand(1);
2415     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2416     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2417     bool LosesInfo;
2418     switch (WideTy.getSizeInBits()) {
2419     case 32:
2420       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2421                   &LosesInfo);
2422       break;
2423     case 64:
2424       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2425                   &LosesInfo);
2426       break;
2427     default:
2428       return UnableToLegalize;
2429     }
2430 
2431     assert(!LosesInfo && "extend should always be lossless");
2432 
2433     Observer.changingInstr(MI);
2434     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2435 
2436     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2437     Observer.changedInstr(MI);
2438     return Legalized;
2439   }
2440   case TargetOpcode::G_IMPLICIT_DEF: {
2441     Observer.changingInstr(MI);
2442     widenScalarDst(MI, WideTy);
2443     Observer.changedInstr(MI);
2444     return Legalized;
2445   }
2446   case TargetOpcode::G_BRCOND:
2447     Observer.changingInstr(MI);
2448     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2449     Observer.changedInstr(MI);
2450     return Legalized;
2451 
2452   case TargetOpcode::G_FCMP:
2453     Observer.changingInstr(MI);
2454     if (TypeIdx == 0)
2455       widenScalarDst(MI, WideTy);
2456     else {
2457       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2458       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2459     }
2460     Observer.changedInstr(MI);
2461     return Legalized;
2462 
2463   case TargetOpcode::G_ICMP:
2464     Observer.changingInstr(MI);
2465     if (TypeIdx == 0)
2466       widenScalarDst(MI, WideTy);
2467     else {
2468       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2469                                MI.getOperand(1).getPredicate()))
2470                                ? TargetOpcode::G_SEXT
2471                                : TargetOpcode::G_ZEXT;
2472       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2473       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2474     }
2475     Observer.changedInstr(MI);
2476     return Legalized;
2477 
2478   case TargetOpcode::G_PTR_ADD:
2479     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2480     Observer.changingInstr(MI);
2481     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2482     Observer.changedInstr(MI);
2483     return Legalized;
2484 
2485   case TargetOpcode::G_PHI: {
2486     assert(TypeIdx == 0 && "Expecting only Idx 0");
2487 
2488     Observer.changingInstr(MI);
2489     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2490       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2491       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2492       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2493     }
2494 
2495     MachineBasicBlock &MBB = *MI.getParent();
2496     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2497     widenScalarDst(MI, WideTy);
2498     Observer.changedInstr(MI);
2499     return Legalized;
2500   }
2501   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2502     if (TypeIdx == 0) {
2503       Register VecReg = MI.getOperand(1).getReg();
2504       LLT VecTy = MRI.getType(VecReg);
2505       Observer.changingInstr(MI);
2506 
2507       widenScalarSrc(
2508           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2509           TargetOpcode::G_ANYEXT);
2510 
2511       widenScalarDst(MI, WideTy, 0);
2512       Observer.changedInstr(MI);
2513       return Legalized;
2514     }
2515 
2516     if (TypeIdx != 2)
2517       return UnableToLegalize;
2518     Observer.changingInstr(MI);
2519     // TODO: Probably should be zext
2520     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2521     Observer.changedInstr(MI);
2522     return Legalized;
2523   }
2524   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2525     if (TypeIdx == 1) {
2526       Observer.changingInstr(MI);
2527 
2528       Register VecReg = MI.getOperand(1).getReg();
2529       LLT VecTy = MRI.getType(VecReg);
2530       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2531 
2532       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2533       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2534       widenScalarDst(MI, WideVecTy, 0);
2535       Observer.changedInstr(MI);
2536       return Legalized;
2537     }
2538 
2539     if (TypeIdx == 2) {
2540       Observer.changingInstr(MI);
2541       // TODO: Probably should be zext
2542       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2543       Observer.changedInstr(MI);
2544       return Legalized;
2545     }
2546 
2547     return UnableToLegalize;
2548   }
2549   case TargetOpcode::G_FADD:
2550   case TargetOpcode::G_FMUL:
2551   case TargetOpcode::G_FSUB:
2552   case TargetOpcode::G_FMA:
2553   case TargetOpcode::G_FMAD:
2554   case TargetOpcode::G_FNEG:
2555   case TargetOpcode::G_FABS:
2556   case TargetOpcode::G_FCANONICALIZE:
2557   case TargetOpcode::G_FMINNUM:
2558   case TargetOpcode::G_FMAXNUM:
2559   case TargetOpcode::G_FMINNUM_IEEE:
2560   case TargetOpcode::G_FMAXNUM_IEEE:
2561   case TargetOpcode::G_FMINIMUM:
2562   case TargetOpcode::G_FMAXIMUM:
2563   case TargetOpcode::G_FDIV:
2564   case TargetOpcode::G_FREM:
2565   case TargetOpcode::G_FCEIL:
2566   case TargetOpcode::G_FFLOOR:
2567   case TargetOpcode::G_FCOS:
2568   case TargetOpcode::G_FSIN:
2569   case TargetOpcode::G_FLOG10:
2570   case TargetOpcode::G_FLOG:
2571   case TargetOpcode::G_FLOG2:
2572   case TargetOpcode::G_FRINT:
2573   case TargetOpcode::G_FNEARBYINT:
2574   case TargetOpcode::G_FSQRT:
2575   case TargetOpcode::G_FEXP:
2576   case TargetOpcode::G_FEXP2:
2577   case TargetOpcode::G_FPOW:
2578   case TargetOpcode::G_INTRINSIC_TRUNC:
2579   case TargetOpcode::G_INTRINSIC_ROUND:
2580   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2581     assert(TypeIdx == 0);
2582     Observer.changingInstr(MI);
2583 
2584     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2585       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2586 
2587     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2588     Observer.changedInstr(MI);
2589     return Legalized;
2590   case TargetOpcode::G_FPOWI: {
2591     if (TypeIdx != 0)
2592       return UnableToLegalize;
2593     Observer.changingInstr(MI);
2594     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2595     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2596     Observer.changedInstr(MI);
2597     return Legalized;
2598   }
2599   case TargetOpcode::G_INTTOPTR:
2600     if (TypeIdx != 1)
2601       return UnableToLegalize;
2602 
2603     Observer.changingInstr(MI);
2604     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2605     Observer.changedInstr(MI);
2606     return Legalized;
2607   case TargetOpcode::G_PTRTOINT:
2608     if (TypeIdx != 0)
2609       return UnableToLegalize;
2610 
2611     Observer.changingInstr(MI);
2612     widenScalarDst(MI, WideTy, 0);
2613     Observer.changedInstr(MI);
2614     return Legalized;
2615   case TargetOpcode::G_BUILD_VECTOR: {
2616     Observer.changingInstr(MI);
2617 
2618     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2619     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2620       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2621 
2622     // Avoid changing the result vector type if the source element type was
2623     // requested.
2624     if (TypeIdx == 1) {
2625       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2626     } else {
2627       widenScalarDst(MI, WideTy, 0);
2628     }
2629 
2630     Observer.changedInstr(MI);
2631     return Legalized;
2632   }
2633   case TargetOpcode::G_SEXT_INREG:
2634     if (TypeIdx != 0)
2635       return UnableToLegalize;
2636 
2637     Observer.changingInstr(MI);
2638     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2639     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2640     Observer.changedInstr(MI);
2641     return Legalized;
2642   case TargetOpcode::G_PTRMASK: {
2643     if (TypeIdx != 1)
2644       return UnableToLegalize;
2645     Observer.changingInstr(MI);
2646     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2647     Observer.changedInstr(MI);
2648     return Legalized;
2649   }
2650   }
2651 }
2652 
2653 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2654                              MachineIRBuilder &B, Register Src, LLT Ty) {
2655   auto Unmerge = B.buildUnmerge(Ty, Src);
2656   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2657     Pieces.push_back(Unmerge.getReg(I));
2658 }
2659 
2660 LegalizerHelper::LegalizeResult
2661 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2662   Register Dst = MI.getOperand(0).getReg();
2663   Register Src = MI.getOperand(1).getReg();
2664   LLT DstTy = MRI.getType(Dst);
2665   LLT SrcTy = MRI.getType(Src);
2666 
2667   if (SrcTy.isVector()) {
2668     LLT SrcEltTy = SrcTy.getElementType();
2669     SmallVector<Register, 8> SrcRegs;
2670 
2671     if (DstTy.isVector()) {
2672       int NumDstElt = DstTy.getNumElements();
2673       int NumSrcElt = SrcTy.getNumElements();
2674 
2675       LLT DstEltTy = DstTy.getElementType();
2676       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2677       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2678 
2679       // If there's an element size mismatch, insert intermediate casts to match
2680       // the result element type.
2681       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2682         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2683         //
2684         // =>
2685         //
2686         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2687         // %3:_(<2 x s8>) = G_BITCAST %2
2688         // %4:_(<2 x s8>) = G_BITCAST %3
2689         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
2690         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2691         SrcPartTy = SrcEltTy;
2692       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2693         //
2694         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2695         //
2696         // =>
2697         //
2698         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2699         // %3:_(s16) = G_BITCAST %2
2700         // %4:_(s16) = G_BITCAST %3
2701         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
2702         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2703         DstCastTy = DstEltTy;
2704       }
2705 
2706       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2707       for (Register &SrcReg : SrcRegs)
2708         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2709     } else
2710       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2711 
2712     MIRBuilder.buildMerge(Dst, SrcRegs);
2713     MI.eraseFromParent();
2714     return Legalized;
2715   }
2716 
2717   if (DstTy.isVector()) {
2718     SmallVector<Register, 8> SrcRegs;
2719     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2720     MIRBuilder.buildMerge(Dst, SrcRegs);
2721     MI.eraseFromParent();
2722     return Legalized;
2723   }
2724 
2725   return UnableToLegalize;
2726 }
2727 
2728 /// Figure out the bit offset into a register when coercing a vector index for
2729 /// the wide element type. This is only for the case when promoting vector to
2730 /// one with larger elements.
2731 //
2732 ///
2733 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2734 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2735 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2736                                                    Register Idx,
2737                                                    unsigned NewEltSize,
2738                                                    unsigned OldEltSize) {
2739   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2740   LLT IdxTy = B.getMRI()->getType(Idx);
2741 
2742   // Now figure out the amount we need to shift to get the target bits.
2743   auto OffsetMask = B.buildConstant(
2744       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2745   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2746   return B.buildShl(IdxTy, OffsetIdx,
2747                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2748 }
2749 
2750 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2751 /// is casting to a vector with a smaller element size, perform multiple element
2752 /// extracts and merge the results. If this is coercing to a vector with larger
2753 /// elements, index the bitcasted vector and extract the target element with bit
2754 /// operations. This is intended to force the indexing in the native register
2755 /// size for architectures that can dynamically index the register file.
2756 LegalizerHelper::LegalizeResult
2757 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2758                                          LLT CastTy) {
2759   if (TypeIdx != 1)
2760     return UnableToLegalize;
2761 
2762   Register Dst = MI.getOperand(0).getReg();
2763   Register SrcVec = MI.getOperand(1).getReg();
2764   Register Idx = MI.getOperand(2).getReg();
2765   LLT SrcVecTy = MRI.getType(SrcVec);
2766   LLT IdxTy = MRI.getType(Idx);
2767 
2768   LLT SrcEltTy = SrcVecTy.getElementType();
2769   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2770   unsigned OldNumElts = SrcVecTy.getNumElements();
2771 
2772   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2773   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2774 
2775   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2776   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2777   if (NewNumElts > OldNumElts) {
2778     // Decreasing the vector element size
2779     //
2780     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2781     //  =>
2782     //  v4i32:castx = bitcast x:v2i64
2783     //
2784     // i64 = bitcast
2785     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2786     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2787     //
2788     if (NewNumElts % OldNumElts != 0)
2789       return UnableToLegalize;
2790 
2791     // Type of the intermediate result vector.
2792     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2793     LLT MidTy =
2794         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2795 
2796     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2797 
2798     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2799     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2800 
2801     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2802       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2803       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2804       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2805       NewOps[I] = Elt.getReg(0);
2806     }
2807 
2808     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2809     MIRBuilder.buildBitcast(Dst, NewVec);
2810     MI.eraseFromParent();
2811     return Legalized;
2812   }
2813 
2814   if (NewNumElts < OldNumElts) {
2815     if (NewEltSize % OldEltSize != 0)
2816       return UnableToLegalize;
2817 
2818     // This only depends on powers of 2 because we use bit tricks to figure out
2819     // the bit offset we need to shift to get the target element. A general
2820     // expansion could emit division/multiply.
2821     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2822       return UnableToLegalize;
2823 
2824     // Increasing the vector element size.
2825     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2826     //
2827     //   =>
2828     //
2829     // %cast = G_BITCAST %vec
2830     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2831     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2832     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2833     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2834     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2835     // %elt = G_TRUNC %elt_bits
2836 
2837     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2838     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2839 
2840     // Divide to get the index in the wider element type.
2841     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2842 
2843     Register WideElt = CastVec;
2844     if (CastTy.isVector()) {
2845       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2846                                                      ScaledIdx).getReg(0);
2847     }
2848 
2849     // Compute the bit offset into the register of the target element.
2850     Register OffsetBits = getBitcastWiderVectorElementOffset(
2851       MIRBuilder, Idx, NewEltSize, OldEltSize);
2852 
2853     // Shift the wide element to get the target element.
2854     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2855     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2856     MI.eraseFromParent();
2857     return Legalized;
2858   }
2859 
2860   return UnableToLegalize;
2861 }
2862 
2863 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2864 /// TargetReg, while preserving other bits in \p TargetReg.
2865 ///
2866 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2867 static Register buildBitFieldInsert(MachineIRBuilder &B,
2868                                     Register TargetReg, Register InsertReg,
2869                                     Register OffsetBits) {
2870   LLT TargetTy = B.getMRI()->getType(TargetReg);
2871   LLT InsertTy = B.getMRI()->getType(InsertReg);
2872   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2873   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2874 
2875   // Produce a bitmask of the value to insert
2876   auto EltMask = B.buildConstant(
2877     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2878                                    InsertTy.getSizeInBits()));
2879   // Shift it into position
2880   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2881   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2882 
2883   // Clear out the bits in the wide element
2884   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2885 
2886   // The value to insert has all zeros already, so stick it into the masked
2887   // wide element.
2888   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2889 }
2890 
2891 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2892 /// is increasing the element size, perform the indexing in the target element
2893 /// type, and use bit operations to insert at the element position. This is
2894 /// intended for architectures that can dynamically index the register file and
2895 /// want to force indexing in the native register size.
2896 LegalizerHelper::LegalizeResult
2897 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2898                                         LLT CastTy) {
2899   if (TypeIdx != 0)
2900     return UnableToLegalize;
2901 
2902   Register Dst = MI.getOperand(0).getReg();
2903   Register SrcVec = MI.getOperand(1).getReg();
2904   Register Val = MI.getOperand(2).getReg();
2905   Register Idx = MI.getOperand(3).getReg();
2906 
2907   LLT VecTy = MRI.getType(Dst);
2908   LLT IdxTy = MRI.getType(Idx);
2909 
2910   LLT VecEltTy = VecTy.getElementType();
2911   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2912   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2913   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2914 
2915   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2916   unsigned OldNumElts = VecTy.getNumElements();
2917 
2918   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2919   if (NewNumElts < OldNumElts) {
2920     if (NewEltSize % OldEltSize != 0)
2921       return UnableToLegalize;
2922 
2923     // This only depends on powers of 2 because we use bit tricks to figure out
2924     // the bit offset we need to shift to get the target element. A general
2925     // expansion could emit division/multiply.
2926     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2927       return UnableToLegalize;
2928 
2929     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2930     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2931 
2932     // Divide to get the index in the wider element type.
2933     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2934 
2935     Register ExtractedElt = CastVec;
2936     if (CastTy.isVector()) {
2937       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2938                                                           ScaledIdx).getReg(0);
2939     }
2940 
2941     // Compute the bit offset into the register of the target element.
2942     Register OffsetBits = getBitcastWiderVectorElementOffset(
2943       MIRBuilder, Idx, NewEltSize, OldEltSize);
2944 
2945     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2946                                                Val, OffsetBits);
2947     if (CastTy.isVector()) {
2948       InsertedElt = MIRBuilder.buildInsertVectorElement(
2949         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2950     }
2951 
2952     MIRBuilder.buildBitcast(Dst, InsertedElt);
2953     MI.eraseFromParent();
2954     return Legalized;
2955   }
2956 
2957   return UnableToLegalize;
2958 }
2959 
2960 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2961   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2962   Register DstReg = LoadMI.getDstReg();
2963   Register PtrReg = LoadMI.getPointerReg();
2964   LLT DstTy = MRI.getType(DstReg);
2965   MachineMemOperand &MMO = LoadMI.getMMO();
2966   LLT MemTy = MMO.getMemoryType();
2967   MachineFunction &MF = MIRBuilder.getMF();
2968 
2969   unsigned MemSizeInBits = MemTy.getSizeInBits();
2970   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2971 
2972   if (MemSizeInBits != MemStoreSizeInBits) {
2973     if (MemTy.isVector())
2974       return UnableToLegalize;
2975 
2976     // Promote to a byte-sized load if not loading an integral number of
2977     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2978     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2979     MachineMemOperand *NewMMO =
2980         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2981 
2982     Register LoadReg = DstReg;
2983     LLT LoadTy = DstTy;
2984 
2985     // If this wasn't already an extending load, we need to widen the result
2986     // register to avoid creating a load with a narrower result than the source.
2987     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2988       LoadTy = WideMemTy;
2989       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2990     }
2991 
2992     if (isa<GSExtLoad>(LoadMI)) {
2993       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2994       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2995     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2996       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2997       // The extra bits are guaranteed to be zero, since we stored them that
2998       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2999       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3000     } else {
3001       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3002     }
3003 
3004     if (DstTy != LoadTy)
3005       MIRBuilder.buildTrunc(DstReg, LoadReg);
3006 
3007     LoadMI.eraseFromParent();
3008     return Legalized;
3009   }
3010 
3011   // Big endian lowering not implemented.
3012   if (MIRBuilder.getDataLayout().isBigEndian())
3013     return UnableToLegalize;
3014 
3015   // This load needs splitting into power of 2 sized loads.
3016   //
3017   // Our strategy here is to generate anyextending loads for the smaller
3018   // types up to next power-2 result type, and then combine the two larger
3019   // result values together, before truncating back down to the non-pow-2
3020   // type.
3021   // E.g. v1 = i24 load =>
3022   // v2 = i32 zextload (2 byte)
3023   // v3 = i32 load (1 byte)
3024   // v4 = i32 shl v3, 16
3025   // v5 = i32 or v4, v2
3026   // v1 = i24 trunc v5
3027   // By doing this we generate the correct truncate which should get
3028   // combined away as an artifact with a matching extend.
3029 
3030   uint64_t LargeSplitSize, SmallSplitSize;
3031 
3032   if (!isPowerOf2_32(MemSizeInBits)) {
3033     // This load needs splitting into power of 2 sized loads.
3034     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
3035     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3036   } else {
3037     // This is already a power of 2, but we still need to split this in half.
3038     //
3039     // Assume we're being asked to decompose an unaligned load.
3040     // TODO: If this requires multiple splits, handle them all at once.
3041     auto &Ctx = MF.getFunction().getContext();
3042     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3043       return UnableToLegalize;
3044 
3045     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3046   }
3047 
3048   if (MemTy.isVector()) {
3049     // TODO: Handle vector extloads
3050     if (MemTy != DstTy)
3051       return UnableToLegalize;
3052 
3053     // TODO: We can do better than scalarizing the vector and at least split it
3054     // in half.
3055     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3056   }
3057 
3058   MachineMemOperand *LargeMMO =
3059       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3060   MachineMemOperand *SmallMMO =
3061       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3062 
3063   LLT PtrTy = MRI.getType(PtrReg);
3064   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3065   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3066   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3067                                              PtrReg, *LargeMMO);
3068 
3069   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3070                                             LargeSplitSize / 8);
3071   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3072   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3073   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3074                                              SmallPtr, *SmallMMO);
3075 
3076   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3077   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3078 
3079   if (AnyExtTy == DstTy)
3080     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3081   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3082     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3083     MIRBuilder.buildTrunc(DstReg, {Or});
3084   } else {
3085     assert(DstTy.isPointer() && "expected pointer");
3086     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3087 
3088     // FIXME: We currently consider this to be illegal for non-integral address
3089     // spaces, but we need still need a way to reinterpret the bits.
3090     MIRBuilder.buildIntToPtr(DstReg, Or);
3091   }
3092 
3093   LoadMI.eraseFromParent();
3094   return Legalized;
3095 }
3096 
3097 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3098   // Lower a non-power of 2 store into multiple pow-2 stores.
3099   // E.g. split an i24 store into an i16 store + i8 store.
3100   // We do this by first extending the stored value to the next largest power
3101   // of 2 type, and then using truncating stores to store the components.
3102   // By doing this, likewise with G_LOAD, generate an extend that can be
3103   // artifact-combined away instead of leaving behind extracts.
3104   Register SrcReg = StoreMI.getValueReg();
3105   Register PtrReg = StoreMI.getPointerReg();
3106   LLT SrcTy = MRI.getType(SrcReg);
3107   MachineFunction &MF = MIRBuilder.getMF();
3108   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3109   LLT MemTy = MMO.getMemoryType();
3110 
3111   unsigned StoreWidth = MemTy.getSizeInBits();
3112   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3113 
3114   if (StoreWidth != StoreSizeInBits) {
3115     if (SrcTy.isVector())
3116       return UnableToLegalize;
3117 
3118     // Promote to a byte-sized store with upper bits zero if not
3119     // storing an integral number of bytes.  For example, promote
3120     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3121     LLT WideTy = LLT::scalar(StoreSizeInBits);
3122 
3123     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3124       // Avoid creating a store with a narrower source than result.
3125       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3126       SrcTy = WideTy;
3127     }
3128 
3129     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3130 
3131     MachineMemOperand *NewMMO =
3132         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3133     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3134     StoreMI.eraseFromParent();
3135     return Legalized;
3136   }
3137 
3138   if (MemTy.isVector()) {
3139     // TODO: Handle vector trunc stores
3140     if (MemTy != SrcTy)
3141       return UnableToLegalize;
3142 
3143     // TODO: We can do better than scalarizing the vector and at least split it
3144     // in half.
3145     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3146   }
3147 
3148   unsigned MemSizeInBits = MemTy.getSizeInBits();
3149   uint64_t LargeSplitSize, SmallSplitSize;
3150 
3151   if (!isPowerOf2_32(MemSizeInBits)) {
3152     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3153     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3154   } else {
3155     auto &Ctx = MF.getFunction().getContext();
3156     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3157       return UnableToLegalize; // Don't know what we're being asked to do.
3158 
3159     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3160   }
3161 
3162   // Extend to the next pow-2. If this store was itself the result of lowering,
3163   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3164   // that's wider than the stored size.
3165   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3166   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3167 
3168   if (SrcTy.isPointer()) {
3169     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3170     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3171   }
3172 
3173   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3174 
3175   // Obtain the smaller value by shifting away the larger value.
3176   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3177   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3178 
3179   // Generate the PtrAdd and truncating stores.
3180   LLT PtrTy = MRI.getType(PtrReg);
3181   auto OffsetCst = MIRBuilder.buildConstant(
3182     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3183   auto SmallPtr =
3184     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3185 
3186   MachineMemOperand *LargeMMO =
3187     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3188   MachineMemOperand *SmallMMO =
3189     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3190   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3191   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3192   StoreMI.eraseFromParent();
3193   return Legalized;
3194 }
3195 
3196 LegalizerHelper::LegalizeResult
3197 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3198   switch (MI.getOpcode()) {
3199   case TargetOpcode::G_LOAD: {
3200     if (TypeIdx != 0)
3201       return UnableToLegalize;
3202     MachineMemOperand &MMO = **MI.memoperands_begin();
3203 
3204     // Not sure how to interpret a bitcast of an extending load.
3205     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3206       return UnableToLegalize;
3207 
3208     Observer.changingInstr(MI);
3209     bitcastDst(MI, CastTy, 0);
3210     MMO.setType(CastTy);
3211     Observer.changedInstr(MI);
3212     return Legalized;
3213   }
3214   case TargetOpcode::G_STORE: {
3215     if (TypeIdx != 0)
3216       return UnableToLegalize;
3217 
3218     MachineMemOperand &MMO = **MI.memoperands_begin();
3219 
3220     // Not sure how to interpret a bitcast of a truncating store.
3221     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3222       return UnableToLegalize;
3223 
3224     Observer.changingInstr(MI);
3225     bitcastSrc(MI, CastTy, 0);
3226     MMO.setType(CastTy);
3227     Observer.changedInstr(MI);
3228     return Legalized;
3229   }
3230   case TargetOpcode::G_SELECT: {
3231     if (TypeIdx != 0)
3232       return UnableToLegalize;
3233 
3234     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3235       LLVM_DEBUG(
3236           dbgs() << "bitcast action not implemented for vector select\n");
3237       return UnableToLegalize;
3238     }
3239 
3240     Observer.changingInstr(MI);
3241     bitcastSrc(MI, CastTy, 2);
3242     bitcastSrc(MI, CastTy, 3);
3243     bitcastDst(MI, CastTy, 0);
3244     Observer.changedInstr(MI);
3245     return Legalized;
3246   }
3247   case TargetOpcode::G_AND:
3248   case TargetOpcode::G_OR:
3249   case TargetOpcode::G_XOR: {
3250     Observer.changingInstr(MI);
3251     bitcastSrc(MI, CastTy, 1);
3252     bitcastSrc(MI, CastTy, 2);
3253     bitcastDst(MI, CastTy, 0);
3254     Observer.changedInstr(MI);
3255     return Legalized;
3256   }
3257   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3258     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3259   case TargetOpcode::G_INSERT_VECTOR_ELT:
3260     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3261   default:
3262     return UnableToLegalize;
3263   }
3264 }
3265 
3266 // Legalize an instruction by changing the opcode in place.
3267 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3268     Observer.changingInstr(MI);
3269     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3270     Observer.changedInstr(MI);
3271 }
3272 
3273 LegalizerHelper::LegalizeResult
3274 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3275   using namespace TargetOpcode;
3276 
3277   switch(MI.getOpcode()) {
3278   default:
3279     return UnableToLegalize;
3280   case TargetOpcode::G_BITCAST:
3281     return lowerBitcast(MI);
3282   case TargetOpcode::G_SREM:
3283   case TargetOpcode::G_UREM: {
3284     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3285     auto Quot =
3286         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3287                               {MI.getOperand(1), MI.getOperand(2)});
3288 
3289     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3290     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3291     MI.eraseFromParent();
3292     return Legalized;
3293   }
3294   case TargetOpcode::G_SADDO:
3295   case TargetOpcode::G_SSUBO:
3296     return lowerSADDO_SSUBO(MI);
3297   case TargetOpcode::G_UMULH:
3298   case TargetOpcode::G_SMULH:
3299     return lowerSMULH_UMULH(MI);
3300   case TargetOpcode::G_SMULO:
3301   case TargetOpcode::G_UMULO: {
3302     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3303     // result.
3304     Register Res = MI.getOperand(0).getReg();
3305     Register Overflow = MI.getOperand(1).getReg();
3306     Register LHS = MI.getOperand(2).getReg();
3307     Register RHS = MI.getOperand(3).getReg();
3308     LLT Ty = MRI.getType(Res);
3309 
3310     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3311                           ? TargetOpcode::G_SMULH
3312                           : TargetOpcode::G_UMULH;
3313 
3314     Observer.changingInstr(MI);
3315     const auto &TII = MIRBuilder.getTII();
3316     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3317     MI.RemoveOperand(1);
3318     Observer.changedInstr(MI);
3319 
3320     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3321     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3322 
3323     // Move insert point forward so we can use the Res register if needed.
3324     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3325 
3326     // For *signed* multiply, overflow is detected by checking:
3327     // (hi != (lo >> bitwidth-1))
3328     if (Opcode == TargetOpcode::G_SMULH) {
3329       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3330       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3331       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3332     } else {
3333       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3334     }
3335     return Legalized;
3336   }
3337   case TargetOpcode::G_FNEG: {
3338     Register Res = MI.getOperand(0).getReg();
3339     LLT Ty = MRI.getType(Res);
3340 
3341     // TODO: Handle vector types once we are able to
3342     // represent them.
3343     if (Ty.isVector())
3344       return UnableToLegalize;
3345     auto SignMask =
3346         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3347     Register SubByReg = MI.getOperand(1).getReg();
3348     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3349     MI.eraseFromParent();
3350     return Legalized;
3351   }
3352   case TargetOpcode::G_FSUB: {
3353     Register Res = MI.getOperand(0).getReg();
3354     LLT Ty = MRI.getType(Res);
3355 
3356     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3357     // First, check if G_FNEG is marked as Lower. If so, we may
3358     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3359     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3360       return UnableToLegalize;
3361     Register LHS = MI.getOperand(1).getReg();
3362     Register RHS = MI.getOperand(2).getReg();
3363     Register Neg = MRI.createGenericVirtualRegister(Ty);
3364     MIRBuilder.buildFNeg(Neg, RHS);
3365     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3366     MI.eraseFromParent();
3367     return Legalized;
3368   }
3369   case TargetOpcode::G_FMAD:
3370     return lowerFMad(MI);
3371   case TargetOpcode::G_FFLOOR:
3372     return lowerFFloor(MI);
3373   case TargetOpcode::G_INTRINSIC_ROUND:
3374     return lowerIntrinsicRound(MI);
3375   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3376     // Since round even is the assumed rounding mode for unconstrained FP
3377     // operations, rint and roundeven are the same operation.
3378     changeOpcode(MI, TargetOpcode::G_FRINT);
3379     return Legalized;
3380   }
3381   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3382     Register OldValRes = MI.getOperand(0).getReg();
3383     Register SuccessRes = MI.getOperand(1).getReg();
3384     Register Addr = MI.getOperand(2).getReg();
3385     Register CmpVal = MI.getOperand(3).getReg();
3386     Register NewVal = MI.getOperand(4).getReg();
3387     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3388                                   **MI.memoperands_begin());
3389     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3390     MI.eraseFromParent();
3391     return Legalized;
3392   }
3393   case TargetOpcode::G_LOAD:
3394   case TargetOpcode::G_SEXTLOAD:
3395   case TargetOpcode::G_ZEXTLOAD:
3396     return lowerLoad(cast<GAnyLoad>(MI));
3397   case TargetOpcode::G_STORE:
3398     return lowerStore(cast<GStore>(MI));
3399   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3400   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3401   case TargetOpcode::G_CTLZ:
3402   case TargetOpcode::G_CTTZ:
3403   case TargetOpcode::G_CTPOP:
3404     return lowerBitCount(MI);
3405   case G_UADDO: {
3406     Register Res = MI.getOperand(0).getReg();
3407     Register CarryOut = MI.getOperand(1).getReg();
3408     Register LHS = MI.getOperand(2).getReg();
3409     Register RHS = MI.getOperand(3).getReg();
3410 
3411     MIRBuilder.buildAdd(Res, LHS, RHS);
3412     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3413 
3414     MI.eraseFromParent();
3415     return Legalized;
3416   }
3417   case G_UADDE: {
3418     Register Res = MI.getOperand(0).getReg();
3419     Register CarryOut = MI.getOperand(1).getReg();
3420     Register LHS = MI.getOperand(2).getReg();
3421     Register RHS = MI.getOperand(3).getReg();
3422     Register CarryIn = MI.getOperand(4).getReg();
3423     LLT Ty = MRI.getType(Res);
3424 
3425     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3426     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3427     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3428     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3429 
3430     MI.eraseFromParent();
3431     return Legalized;
3432   }
3433   case G_USUBO: {
3434     Register Res = MI.getOperand(0).getReg();
3435     Register BorrowOut = MI.getOperand(1).getReg();
3436     Register LHS = MI.getOperand(2).getReg();
3437     Register RHS = MI.getOperand(3).getReg();
3438 
3439     MIRBuilder.buildSub(Res, LHS, RHS);
3440     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3441 
3442     MI.eraseFromParent();
3443     return Legalized;
3444   }
3445   case G_USUBE: {
3446     Register Res = MI.getOperand(0).getReg();
3447     Register BorrowOut = MI.getOperand(1).getReg();
3448     Register LHS = MI.getOperand(2).getReg();
3449     Register RHS = MI.getOperand(3).getReg();
3450     Register BorrowIn = MI.getOperand(4).getReg();
3451     const LLT CondTy = MRI.getType(BorrowOut);
3452     const LLT Ty = MRI.getType(Res);
3453 
3454     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3455     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3456     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3457 
3458     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3459     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3460     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3461 
3462     MI.eraseFromParent();
3463     return Legalized;
3464   }
3465   case G_UITOFP:
3466     return lowerUITOFP(MI);
3467   case G_SITOFP:
3468     return lowerSITOFP(MI);
3469   case G_FPTOUI:
3470     return lowerFPTOUI(MI);
3471   case G_FPTOSI:
3472     return lowerFPTOSI(MI);
3473   case G_FPTRUNC:
3474     return lowerFPTRUNC(MI);
3475   case G_FPOWI:
3476     return lowerFPOWI(MI);
3477   case G_SMIN:
3478   case G_SMAX:
3479   case G_UMIN:
3480   case G_UMAX:
3481     return lowerMinMax(MI);
3482   case G_FCOPYSIGN:
3483     return lowerFCopySign(MI);
3484   case G_FMINNUM:
3485   case G_FMAXNUM:
3486     return lowerFMinNumMaxNum(MI);
3487   case G_MERGE_VALUES:
3488     return lowerMergeValues(MI);
3489   case G_UNMERGE_VALUES:
3490     return lowerUnmergeValues(MI);
3491   case TargetOpcode::G_SEXT_INREG: {
3492     assert(MI.getOperand(2).isImm() && "Expected immediate");
3493     int64_t SizeInBits = MI.getOperand(2).getImm();
3494 
3495     Register DstReg = MI.getOperand(0).getReg();
3496     Register SrcReg = MI.getOperand(1).getReg();
3497     LLT DstTy = MRI.getType(DstReg);
3498     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3499 
3500     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3501     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3502     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3503     MI.eraseFromParent();
3504     return Legalized;
3505   }
3506   case G_EXTRACT_VECTOR_ELT:
3507   case G_INSERT_VECTOR_ELT:
3508     return lowerExtractInsertVectorElt(MI);
3509   case G_SHUFFLE_VECTOR:
3510     return lowerShuffleVector(MI);
3511   case G_DYN_STACKALLOC:
3512     return lowerDynStackAlloc(MI);
3513   case G_EXTRACT:
3514     return lowerExtract(MI);
3515   case G_INSERT:
3516     return lowerInsert(MI);
3517   case G_BSWAP:
3518     return lowerBswap(MI);
3519   case G_BITREVERSE:
3520     return lowerBitreverse(MI);
3521   case G_READ_REGISTER:
3522   case G_WRITE_REGISTER:
3523     return lowerReadWriteRegister(MI);
3524   case G_UADDSAT:
3525   case G_USUBSAT: {
3526     // Try to make a reasonable guess about which lowering strategy to use. The
3527     // target can override this with custom lowering and calling the
3528     // implementation functions.
3529     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3530     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3531       return lowerAddSubSatToMinMax(MI);
3532     return lowerAddSubSatToAddoSubo(MI);
3533   }
3534   case G_SADDSAT:
3535   case G_SSUBSAT: {
3536     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3537 
3538     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3539     // since it's a shorter expansion. However, we would need to figure out the
3540     // preferred boolean type for the carry out for the query.
3541     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3542       return lowerAddSubSatToMinMax(MI);
3543     return lowerAddSubSatToAddoSubo(MI);
3544   }
3545   case G_SSHLSAT:
3546   case G_USHLSAT:
3547     return lowerShlSat(MI);
3548   case G_ABS:
3549     return lowerAbsToAddXor(MI);
3550   case G_SELECT:
3551     return lowerSelect(MI);
3552   case G_SDIVREM:
3553   case G_UDIVREM:
3554     return lowerDIVREM(MI);
3555   case G_FSHL:
3556   case G_FSHR:
3557     return lowerFunnelShift(MI);
3558   case G_ROTL:
3559   case G_ROTR:
3560     return lowerRotate(MI);
3561   case G_MEMSET:
3562   case G_MEMCPY:
3563   case G_MEMMOVE:
3564     return lowerMemCpyFamily(MI);
3565   case G_MEMCPY_INLINE:
3566     return lowerMemcpyInline(MI);
3567   GISEL_VECREDUCE_CASES_NONSEQ
3568     return lowerVectorReduction(MI);
3569   }
3570 }
3571 
3572 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3573                                                   Align MinAlign) const {
3574   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3575   // datalayout for the preferred alignment. Also there should be a target hook
3576   // for this to allow targets to reduce the alignment and ignore the
3577   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3578   // the type.
3579   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3580 }
3581 
3582 MachineInstrBuilder
3583 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3584                                       MachinePointerInfo &PtrInfo) {
3585   MachineFunction &MF = MIRBuilder.getMF();
3586   const DataLayout &DL = MIRBuilder.getDataLayout();
3587   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3588 
3589   unsigned AddrSpace = DL.getAllocaAddrSpace();
3590   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3591 
3592   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3593   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3594 }
3595 
3596 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3597                                         LLT VecTy) {
3598   int64_t IdxVal;
3599   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3600     return IdxReg;
3601 
3602   LLT IdxTy = B.getMRI()->getType(IdxReg);
3603   unsigned NElts = VecTy.getNumElements();
3604   if (isPowerOf2_32(NElts)) {
3605     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3606     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3607   }
3608 
3609   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3610       .getReg(0);
3611 }
3612 
3613 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3614                                                   Register Index) {
3615   LLT EltTy = VecTy.getElementType();
3616 
3617   // Calculate the element offset and add it to the pointer.
3618   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3619   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3620          "Converting bits to bytes lost precision");
3621 
3622   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3623 
3624   LLT IdxTy = MRI.getType(Index);
3625   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3626                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3627 
3628   LLT PtrTy = MRI.getType(VecPtr);
3629   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3630 }
3631 
3632 #ifndef NDEBUG
3633 /// Check that all vector operands have same number of elements. Other operands
3634 /// should be listed in NonVecOp.
3635 static bool hasSameNumEltsOnAllVectorOperands(
3636     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3637     std::initializer_list<unsigned> NonVecOpIndices) {
3638   if (MI.getNumMemOperands() != 0)
3639     return false;
3640 
3641   LLT VecTy = MRI.getType(MI.getReg(0));
3642   if (!VecTy.isVector())
3643     return false;
3644   unsigned NumElts = VecTy.getNumElements();
3645 
3646   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3647     MachineOperand &Op = MI.getOperand(OpIdx);
3648     if (!Op.isReg()) {
3649       if (!is_contained(NonVecOpIndices, OpIdx))
3650         return false;
3651       continue;
3652     }
3653 
3654     LLT Ty = MRI.getType(Op.getReg());
3655     if (!Ty.isVector()) {
3656       if (!is_contained(NonVecOpIndices, OpIdx))
3657         return false;
3658       continue;
3659     }
3660 
3661     if (Ty.getNumElements() != NumElts)
3662       return false;
3663   }
3664 
3665   return true;
3666 }
3667 #endif
3668 
3669 /// Fill \p DstOps with DstOps that have same number of elements combined as
3670 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
3671 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
3672 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
3673 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3674                        unsigned NumElts) {
3675   LLT LeftoverTy;
3676   assert(Ty.isVector() && "Expected vector type");
3677   LLT EltTy = Ty.getElementType();
3678   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3679   int NumParts, NumLeftover;
3680   std::tie(NumParts, NumLeftover) =
3681       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3682 
3683   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3684   for (int i = 0; i < NumParts; ++i) {
3685     DstOps.push_back(NarrowTy);
3686   }
3687 
3688   if (LeftoverTy.isValid()) {
3689     assert(NumLeftover == 1 && "expected exactly one leftover");
3690     DstOps.push_back(LeftoverTy);
3691   }
3692 }
3693 
3694 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
3695 /// made from \p Op depending on operand type.
3696 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3697                            MachineOperand &Op) {
3698   for (unsigned i = 0; i < N; ++i) {
3699     if (Op.isReg())
3700       Ops.push_back(Op.getReg());
3701     else if (Op.isImm())
3702       Ops.push_back(Op.getImm());
3703     else if (Op.isPredicate())
3704       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3705     else
3706       llvm_unreachable("Unsupported type");
3707   }
3708 }
3709 
3710 // Handle splitting vector operations which need to have the same number of
3711 // elements in each type index, but each type index may have a different element
3712 // type.
3713 //
3714 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3715 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3716 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3717 //
3718 // Also handles some irregular breakdown cases, e.g.
3719 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3720 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3721 //             s64 = G_SHL s64, s32
3722 LegalizerHelper::LegalizeResult
3723 LegalizerHelper::fewerElementsVectorMultiEltType(
3724     GenericMachineInstr &MI, unsigned NumElts,
3725     std::initializer_list<unsigned> NonVecOpIndices) {
3726   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3727          "Non-compatible opcode or not specified non-vector operands");
3728   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3729 
3730   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3731   unsigned NumDefs = MI.getNumDefs();
3732 
3733   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3734   // Build instructions with DstOps to use instruction found by CSE directly.
3735   // CSE copies found instruction into given vreg when building with vreg dest.
3736   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3737   // Output registers will be taken from created instructions.
3738   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3739   for (unsigned i = 0; i < NumDefs; ++i) {
3740     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3741   }
3742 
3743   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3744   // Operands listed in NonVecOpIndices will be used as is without splitting;
3745   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3746   // scalar condition (op 1), immediate in sext_inreg (op 2).
3747   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3748   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3749        ++UseIdx, ++UseNo) {
3750     if (is_contained(NonVecOpIndices, UseIdx)) {
3751       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3752                      MI.getOperand(UseIdx));
3753     } else {
3754       SmallVector<Register, 8> SplitPieces;
3755       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3756       for (auto Reg : SplitPieces)
3757         InputOpsPieces[UseNo].push_back(Reg);
3758     }
3759   }
3760 
3761   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3762 
3763   // Take i-th piece of each input operand split and build sub-vector/scalar
3764   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3765   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3766     SmallVector<DstOp, 2> Defs;
3767     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3768       Defs.push_back(OutputOpsPieces[DstNo][i]);
3769 
3770     SmallVector<SrcOp, 3> Uses;
3771     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3772       Uses.push_back(InputOpsPieces[InputNo][i]);
3773 
3774     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3775     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3776       OutputRegs[DstNo].push_back(I.getReg(DstNo));
3777   }
3778 
3779   // Merge small outputs into MI's output for each def operand.
3780   if (NumLeftovers) {
3781     for (unsigned i = 0; i < NumDefs; ++i)
3782       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3783   } else {
3784     for (unsigned i = 0; i < NumDefs; ++i)
3785       MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]);
3786   }
3787 
3788   MI.eraseFromParent();
3789   return Legalized;
3790 }
3791 
3792 LegalizerHelper::LegalizeResult
3793 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3794                                         unsigned NumElts) {
3795   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3796 
3797   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3798   unsigned NumDefs = MI.getNumDefs();
3799 
3800   SmallVector<DstOp, 8> OutputOpsPieces;
3801   SmallVector<Register, 8> OutputRegs;
3802   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3803 
3804   // Instructions that perform register split will be inserted in basic block
3805   // where register is defined (basic block is in the next operand).
3806   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3807   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3808        UseIdx += 2, ++UseNo) {
3809     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3810     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3811     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3812   }
3813 
3814   // Build PHIs with fewer elements.
3815   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3816   MIRBuilder.setInsertPt(*MI.getParent(), MI);
3817   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3818     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3819     Phi.addDef(
3820         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3821     OutputRegs.push_back(Phi.getReg(0));
3822 
3823     for (unsigned j = 0; j < NumInputs / 2; ++j) {
3824       Phi.addUse(InputOpsPieces[j][i]);
3825       Phi.add(MI.getOperand(1 + j * 2 + 1));
3826     }
3827   }
3828 
3829   // Merge small outputs into MI's def.
3830   if (NumLeftovers) {
3831     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3832   } else {
3833     MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
3834   }
3835 
3836   MI.eraseFromParent();
3837   return Legalized;
3838 }
3839 
3840 LegalizerHelper::LegalizeResult
3841 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3842                                                   unsigned TypeIdx,
3843                                                   LLT NarrowTy) {
3844   const int NumDst = MI.getNumOperands() - 1;
3845   const Register SrcReg = MI.getOperand(NumDst).getReg();
3846   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3847   LLT SrcTy = MRI.getType(SrcReg);
3848 
3849   if (TypeIdx != 1 || NarrowTy == DstTy)
3850     return UnableToLegalize;
3851 
3852   // Requires compatible types. Otherwise SrcReg should have been defined by
3853   // merge-like instruction that would get artifact combined. Most likely
3854   // instruction that defines SrcReg has to perform more/fewer elements
3855   // legalization compatible with NarrowTy.
3856   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3857   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3858 
3859   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3860       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3861     return UnableToLegalize;
3862 
3863   // This is most likely DstTy (smaller then register size) packed in SrcTy
3864   // (larger then register size) and since unmerge was not combined it will be
3865   // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
3866   // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
3867 
3868   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3869   //
3870   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3871   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3872   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3873   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3874   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3875   const int PartsPerUnmerge = NumDst / NumUnmerge;
3876 
3877   for (int I = 0; I != NumUnmerge; ++I) {
3878     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3879 
3880     for (int J = 0; J != PartsPerUnmerge; ++J)
3881       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3882     MIB.addUse(Unmerge.getReg(I));
3883   }
3884 
3885   MI.eraseFromParent();
3886   return Legalized;
3887 }
3888 
3889 LegalizerHelper::LegalizeResult
3890 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3891                                           LLT NarrowTy) {
3892   Register DstReg = MI.getOperand(0).getReg();
3893   LLT DstTy = MRI.getType(DstReg);
3894   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
3895   // Requires compatible types. Otherwise user of DstReg did not perform unmerge
3896   // that should have been artifact combined. Most likely instruction that uses
3897   // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
3898   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3899   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3900   if (NarrowTy == SrcTy)
3901     return UnableToLegalize;
3902 
3903   // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
3904   // is for old mir tests. Since the changes to more/fewer elements it should no
3905   // longer be possible to generate MIR like this when starting from llvm-ir
3906   // because LCMTy approach was replaced with merge/unmerge to vector elements.
3907   if (TypeIdx == 1) {
3908     assert(SrcTy.isVector() && "Expected vector types");
3909     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3910     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3911         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3912       return UnableToLegalize;
3913     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3914     //
3915     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3916     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3917     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3918     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3919     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3920     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3921 
3922     SmallVector<Register, 8> Elts;
3923     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3924     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3925       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3926       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3927         Elts.push_back(Unmerge.getReg(j));
3928     }
3929 
3930     SmallVector<Register, 8> NarrowTyElts;
3931     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3932     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3933     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3934          ++i, Offset += NumNarrowTyElts) {
3935       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3936       NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
3937     }
3938 
3939     MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3940     MI.eraseFromParent();
3941     return Legalized;
3942   }
3943 
3944   assert(TypeIdx == 0 && "Bad type index");
3945   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3946       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3947     return UnableToLegalize;
3948 
3949   // This is most likely SrcTy (smaller then register size) packed in DstTy
3950   // (larger then register size) and since merge was not combined it will be
3951   // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
3952   // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.
3953 
3954   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3955   //
3956   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3957   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3958   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3959   SmallVector<Register, 8> NarrowTyElts;
3960   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3961   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3962   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3963   for (unsigned i = 0; i < NumParts; ++i) {
3964     SmallVector<Register, 8> Sources;
3965     for (unsigned j = 0; j < NumElts; ++j)
3966       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3967     NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
3968   }
3969 
3970   MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3971   MI.eraseFromParent();
3972   return Legalized;
3973 }
3974 
3975 LegalizerHelper::LegalizeResult
3976 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3977                                                            unsigned TypeIdx,
3978                                                            LLT NarrowVecTy) {
3979   Register DstReg = MI.getOperand(0).getReg();
3980   Register SrcVec = MI.getOperand(1).getReg();
3981   Register InsertVal;
3982   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3983 
3984   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3985   if (IsInsert)
3986     InsertVal = MI.getOperand(2).getReg();
3987 
3988   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3989 
3990   // TODO: Handle total scalarization case.
3991   if (!NarrowVecTy.isVector())
3992     return UnableToLegalize;
3993 
3994   LLT VecTy = MRI.getType(SrcVec);
3995 
3996   // If the index is a constant, we can really break this down as you would
3997   // expect, and index into the target size pieces.
3998   int64_t IdxVal;
3999   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4000   if (MaybeCst) {
4001     IdxVal = MaybeCst->Value.getSExtValue();
4002     // Avoid out of bounds indexing the pieces.
4003     if (IdxVal >= VecTy.getNumElements()) {
4004       MIRBuilder.buildUndef(DstReg);
4005       MI.eraseFromParent();
4006       return Legalized;
4007     }
4008 
4009     SmallVector<Register, 8> VecParts;
4010     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4011 
4012     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4013     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4014                                     TargetOpcode::G_ANYEXT);
4015 
4016     unsigned NewNumElts = NarrowVecTy.getNumElements();
4017 
4018     LLT IdxTy = MRI.getType(Idx);
4019     int64_t PartIdx = IdxVal / NewNumElts;
4020     auto NewIdx =
4021         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4022 
4023     if (IsInsert) {
4024       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4025 
4026       // Use the adjusted index to insert into one of the subvectors.
4027       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4028           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4029       VecParts[PartIdx] = InsertPart.getReg(0);
4030 
4031       // Recombine the inserted subvector with the others to reform the result
4032       // vector.
4033       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4034     } else {
4035       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4036     }
4037 
4038     MI.eraseFromParent();
4039     return Legalized;
4040   }
4041 
4042   // With a variable index, we can't perform the operation in a smaller type, so
4043   // we're forced to expand this.
4044   //
4045   // TODO: We could emit a chain of compare/select to figure out which piece to
4046   // index.
4047   return lowerExtractInsertVectorElt(MI);
4048 }
4049 
4050 LegalizerHelper::LegalizeResult
4051 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4052                                       LLT NarrowTy) {
4053   // FIXME: Don't know how to handle secondary types yet.
4054   if (TypeIdx != 0)
4055     return UnableToLegalize;
4056 
4057   // This implementation doesn't work for atomics. Give up instead of doing
4058   // something invalid.
4059   if (LdStMI.isAtomic())
4060     return UnableToLegalize;
4061 
4062   bool IsLoad = isa<GLoad>(LdStMI);
4063   Register ValReg = LdStMI.getReg(0);
4064   Register AddrReg = LdStMI.getPointerReg();
4065   LLT ValTy = MRI.getType(ValReg);
4066 
4067   // FIXME: Do we need a distinct NarrowMemory legalize action?
4068   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4069     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4070     return UnableToLegalize;
4071   }
4072 
4073   int NumParts = -1;
4074   int NumLeftover = -1;
4075   LLT LeftoverTy;
4076   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4077   if (IsLoad) {
4078     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4079   } else {
4080     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4081                      NarrowLeftoverRegs)) {
4082       NumParts = NarrowRegs.size();
4083       NumLeftover = NarrowLeftoverRegs.size();
4084     }
4085   }
4086 
4087   if (NumParts == -1)
4088     return UnableToLegalize;
4089 
4090   LLT PtrTy = MRI.getType(AddrReg);
4091   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4092 
4093   unsigned TotalSize = ValTy.getSizeInBits();
4094 
4095   // Split the load/store into PartTy sized pieces starting at Offset. If this
4096   // is a load, return the new registers in ValRegs. For a store, each elements
4097   // of ValRegs should be PartTy. Returns the next offset that needs to be
4098   // handled.
4099   bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4100   auto MMO = LdStMI.getMMO();
4101   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4102                              unsigned NumParts, unsigned Offset) -> unsigned {
4103     MachineFunction &MF = MIRBuilder.getMF();
4104     unsigned PartSize = PartTy.getSizeInBits();
4105     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4106          ++Idx) {
4107       unsigned ByteOffset = Offset / 8;
4108       Register NewAddrReg;
4109 
4110       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4111 
4112       MachineMemOperand *NewMMO =
4113           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4114 
4115       if (IsLoad) {
4116         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4117         ValRegs.push_back(Dst);
4118         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4119       } else {
4120         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4121       }
4122       Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4123     }
4124 
4125     return Offset;
4126   };
4127 
4128   unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4129   unsigned HandledOffset =
4130       splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4131 
4132   // Handle the rest of the register if this isn't an even type breakdown.
4133   if (LeftoverTy.isValid())
4134     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4135 
4136   if (IsLoad) {
4137     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4138                 LeftoverTy, NarrowLeftoverRegs);
4139   }
4140 
4141   LdStMI.eraseFromParent();
4142   return Legalized;
4143 }
4144 
4145 LegalizerHelper::LegalizeResult
4146 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4147                                      LLT NarrowTy) {
4148   using namespace TargetOpcode;
4149   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4150   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4151 
4152   switch (MI.getOpcode()) {
4153   case G_IMPLICIT_DEF:
4154   case G_TRUNC:
4155   case G_AND:
4156   case G_OR:
4157   case G_XOR:
4158   case G_ADD:
4159   case G_SUB:
4160   case G_MUL:
4161   case G_PTR_ADD:
4162   case G_SMULH:
4163   case G_UMULH:
4164   case G_FADD:
4165   case G_FMUL:
4166   case G_FSUB:
4167   case G_FNEG:
4168   case G_FABS:
4169   case G_FCANONICALIZE:
4170   case G_FDIV:
4171   case G_FREM:
4172   case G_FMA:
4173   case G_FMAD:
4174   case G_FPOW:
4175   case G_FEXP:
4176   case G_FEXP2:
4177   case G_FLOG:
4178   case G_FLOG2:
4179   case G_FLOG10:
4180   case G_FNEARBYINT:
4181   case G_FCEIL:
4182   case G_FFLOOR:
4183   case G_FRINT:
4184   case G_INTRINSIC_ROUND:
4185   case G_INTRINSIC_ROUNDEVEN:
4186   case G_INTRINSIC_TRUNC:
4187   case G_FCOS:
4188   case G_FSIN:
4189   case G_FSQRT:
4190   case G_BSWAP:
4191   case G_BITREVERSE:
4192   case G_SDIV:
4193   case G_UDIV:
4194   case G_SREM:
4195   case G_UREM:
4196   case G_SDIVREM:
4197   case G_UDIVREM:
4198   case G_SMIN:
4199   case G_SMAX:
4200   case G_UMIN:
4201   case G_UMAX:
4202   case G_ABS:
4203   case G_FMINNUM:
4204   case G_FMAXNUM:
4205   case G_FMINNUM_IEEE:
4206   case G_FMAXNUM_IEEE:
4207   case G_FMINIMUM:
4208   case G_FMAXIMUM:
4209   case G_FSHL:
4210   case G_FSHR:
4211   case G_ROTL:
4212   case G_ROTR:
4213   case G_FREEZE:
4214   case G_SADDSAT:
4215   case G_SSUBSAT:
4216   case G_UADDSAT:
4217   case G_USUBSAT:
4218   case G_UMULO:
4219   case G_SMULO:
4220   case G_SHL:
4221   case G_LSHR:
4222   case G_ASHR:
4223   case G_SSHLSAT:
4224   case G_USHLSAT:
4225   case G_CTLZ:
4226   case G_CTLZ_ZERO_UNDEF:
4227   case G_CTTZ:
4228   case G_CTTZ_ZERO_UNDEF:
4229   case G_CTPOP:
4230   case G_FCOPYSIGN:
4231   case G_ZEXT:
4232   case G_SEXT:
4233   case G_ANYEXT:
4234   case G_FPEXT:
4235   case G_FPTRUNC:
4236   case G_SITOFP:
4237   case G_UITOFP:
4238   case G_FPTOSI:
4239   case G_FPTOUI:
4240   case G_INTTOPTR:
4241   case G_PTRTOINT:
4242   case G_ADDRSPACE_CAST:
4243     return fewerElementsVectorMultiEltType(GMI, NumElts);
4244   case G_ICMP:
4245   case G_FCMP:
4246     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/});
4247   case G_SELECT:
4248     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4249       return fewerElementsVectorMultiEltType(GMI, NumElts);
4250     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4251   case G_PHI:
4252     return fewerElementsVectorPhi(GMI, NumElts);
4253   case G_UNMERGE_VALUES:
4254     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4255   case G_BUILD_VECTOR:
4256     assert(TypeIdx == 0 && "not a vector type index");
4257     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4258   case G_CONCAT_VECTORS:
4259     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4260       return UnableToLegalize;
4261     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4262   case G_EXTRACT_VECTOR_ELT:
4263   case G_INSERT_VECTOR_ELT:
4264     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4265   case G_LOAD:
4266   case G_STORE:
4267     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4268   case G_SEXT_INREG:
4269     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4270   GISEL_VECREDUCE_CASES_NONSEQ
4271     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4272   case G_SHUFFLE_VECTOR:
4273     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4274   default:
4275     return UnableToLegalize;
4276   }
4277 }
4278 
4279 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4280     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4281   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4282   if (TypeIdx != 0)
4283     return UnableToLegalize;
4284 
4285   Register DstReg = MI.getOperand(0).getReg();
4286   Register Src1Reg = MI.getOperand(1).getReg();
4287   Register Src2Reg = MI.getOperand(2).getReg();
4288   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4289   LLT DstTy = MRI.getType(DstReg);
4290   LLT Src1Ty = MRI.getType(Src1Reg);
4291   LLT Src2Ty = MRI.getType(Src2Reg);
4292   // The shuffle should be canonicalized by now.
4293   if (DstTy != Src1Ty)
4294     return UnableToLegalize;
4295   if (DstTy != Src2Ty)
4296     return UnableToLegalize;
4297 
4298   if (!isPowerOf2_32(DstTy.getNumElements()))
4299     return UnableToLegalize;
4300 
4301   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4302   // Further legalization attempts will be needed to do split further.
4303   NarrowTy =
4304       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4305   unsigned NewElts = NarrowTy.getNumElements();
4306 
4307   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4308   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4309   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4310   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4311                         SplitSrc2Regs[1]};
4312 
4313   Register Hi, Lo;
4314 
4315   // If Lo or Hi uses elements from at most two of the four input vectors, then
4316   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4317   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4318   SmallVector<int, 16> Ops;
4319   for (unsigned High = 0; High < 2; ++High) {
4320     Register &Output = High ? Hi : Lo;
4321 
4322     // Build a shuffle mask for the output, discovering on the fly which
4323     // input vectors to use as shuffle operands (recorded in InputUsed).
4324     // If building a suitable shuffle vector proves too hard, then bail
4325     // out with useBuildVector set.
4326     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4327     unsigned FirstMaskIdx = High * NewElts;
4328     bool UseBuildVector = false;
4329     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4330       // The mask element.  This indexes into the input.
4331       int Idx = Mask[FirstMaskIdx + MaskOffset];
4332 
4333       // The input vector this mask element indexes into.
4334       unsigned Input = (unsigned)Idx / NewElts;
4335 
4336       if (Input >= array_lengthof(Inputs)) {
4337         // The mask element does not index into any input vector.
4338         Ops.push_back(-1);
4339         continue;
4340       }
4341 
4342       // Turn the index into an offset from the start of the input vector.
4343       Idx -= Input * NewElts;
4344 
4345       // Find or create a shuffle vector operand to hold this input.
4346       unsigned OpNo;
4347       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4348         if (InputUsed[OpNo] == Input) {
4349           // This input vector is already an operand.
4350           break;
4351         } else if (InputUsed[OpNo] == -1U) {
4352           // Create a new operand for this input vector.
4353           InputUsed[OpNo] = Input;
4354           break;
4355         }
4356       }
4357 
4358       if (OpNo >= array_lengthof(InputUsed)) {
4359         // More than two input vectors used!  Give up on trying to create a
4360         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4361         UseBuildVector = true;
4362         break;
4363       }
4364 
4365       // Add the mask index for the new shuffle vector.
4366       Ops.push_back(Idx + OpNo * NewElts);
4367     }
4368 
4369     if (UseBuildVector) {
4370       LLT EltTy = NarrowTy.getElementType();
4371       SmallVector<Register, 16> SVOps;
4372 
4373       // Extract the input elements by hand.
4374       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4375         // The mask element.  This indexes into the input.
4376         int Idx = Mask[FirstMaskIdx + MaskOffset];
4377 
4378         // The input vector this mask element indexes into.
4379         unsigned Input = (unsigned)Idx / NewElts;
4380 
4381         if (Input >= array_lengthof(Inputs)) {
4382           // The mask element is "undef" or indexes off the end of the input.
4383           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4384           continue;
4385         }
4386 
4387         // Turn the index into an offset from the start of the input vector.
4388         Idx -= Input * NewElts;
4389 
4390         // Extract the vector element by hand.
4391         SVOps.push_back(MIRBuilder
4392                             .buildExtractVectorElement(
4393                                 EltTy, Inputs[Input],
4394                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4395                             .getReg(0));
4396       }
4397 
4398       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4399       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4400     } else if (InputUsed[0] == -1U) {
4401       // No input vectors were used! The result is undefined.
4402       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4403     } else {
4404       Register Op0 = Inputs[InputUsed[0]];
4405       // If only one input was used, use an undefined vector for the other.
4406       Register Op1 = InputUsed[1] == -1U
4407                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4408                          : Inputs[InputUsed[1]];
4409       // At least one input vector was used. Create a new shuffle vector.
4410       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4411     }
4412 
4413     Ops.clear();
4414   }
4415 
4416   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4417   MI.eraseFromParent();
4418   return Legalized;
4419 }
4420 
4421 static unsigned getScalarOpcForReduction(unsigned Opc) {
4422   unsigned ScalarOpc;
4423   switch (Opc) {
4424   case TargetOpcode::G_VECREDUCE_FADD:
4425     ScalarOpc = TargetOpcode::G_FADD;
4426     break;
4427   case TargetOpcode::G_VECREDUCE_FMUL:
4428     ScalarOpc = TargetOpcode::G_FMUL;
4429     break;
4430   case TargetOpcode::G_VECREDUCE_FMAX:
4431     ScalarOpc = TargetOpcode::G_FMAXNUM;
4432     break;
4433   case TargetOpcode::G_VECREDUCE_FMIN:
4434     ScalarOpc = TargetOpcode::G_FMINNUM;
4435     break;
4436   case TargetOpcode::G_VECREDUCE_ADD:
4437     ScalarOpc = TargetOpcode::G_ADD;
4438     break;
4439   case TargetOpcode::G_VECREDUCE_MUL:
4440     ScalarOpc = TargetOpcode::G_MUL;
4441     break;
4442   case TargetOpcode::G_VECREDUCE_AND:
4443     ScalarOpc = TargetOpcode::G_AND;
4444     break;
4445   case TargetOpcode::G_VECREDUCE_OR:
4446     ScalarOpc = TargetOpcode::G_OR;
4447     break;
4448   case TargetOpcode::G_VECREDUCE_XOR:
4449     ScalarOpc = TargetOpcode::G_XOR;
4450     break;
4451   case TargetOpcode::G_VECREDUCE_SMAX:
4452     ScalarOpc = TargetOpcode::G_SMAX;
4453     break;
4454   case TargetOpcode::G_VECREDUCE_SMIN:
4455     ScalarOpc = TargetOpcode::G_SMIN;
4456     break;
4457   case TargetOpcode::G_VECREDUCE_UMAX:
4458     ScalarOpc = TargetOpcode::G_UMAX;
4459     break;
4460   case TargetOpcode::G_VECREDUCE_UMIN:
4461     ScalarOpc = TargetOpcode::G_UMIN;
4462     break;
4463   default:
4464     llvm_unreachable("Unhandled reduction");
4465   }
4466   return ScalarOpc;
4467 }
4468 
4469 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4470     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4471   unsigned Opc = MI.getOpcode();
4472   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4473          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4474          "Sequential reductions not expected");
4475 
4476   if (TypeIdx != 1)
4477     return UnableToLegalize;
4478 
4479   // The semantics of the normal non-sequential reductions allow us to freely
4480   // re-associate the operation.
4481   Register SrcReg = MI.getOperand(1).getReg();
4482   LLT SrcTy = MRI.getType(SrcReg);
4483   Register DstReg = MI.getOperand(0).getReg();
4484   LLT DstTy = MRI.getType(DstReg);
4485 
4486   if (NarrowTy.isVector() &&
4487       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4488     return UnableToLegalize;
4489 
4490   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4491   SmallVector<Register> SplitSrcs;
4492   // If NarrowTy is a scalar then we're being asked to scalarize.
4493   const unsigned NumParts =
4494       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4495                           : SrcTy.getNumElements();
4496 
4497   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4498   if (NarrowTy.isScalar()) {
4499     if (DstTy != NarrowTy)
4500       return UnableToLegalize; // FIXME: handle implicit extensions.
4501 
4502     if (isPowerOf2_32(NumParts)) {
4503       // Generate a tree of scalar operations to reduce the critical path.
4504       SmallVector<Register> PartialResults;
4505       unsigned NumPartsLeft = NumParts;
4506       while (NumPartsLeft > 1) {
4507         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4508           PartialResults.emplace_back(
4509               MIRBuilder
4510                   .buildInstr(ScalarOpc, {NarrowTy},
4511                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4512                   .getReg(0));
4513         }
4514         SplitSrcs = PartialResults;
4515         PartialResults.clear();
4516         NumPartsLeft = SplitSrcs.size();
4517       }
4518       assert(SplitSrcs.size() == 1);
4519       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4520       MI.eraseFromParent();
4521       return Legalized;
4522     }
4523     // If we can't generate a tree, then just do sequential operations.
4524     Register Acc = SplitSrcs[0];
4525     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4526       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4527                 .getReg(0);
4528     MIRBuilder.buildCopy(DstReg, Acc);
4529     MI.eraseFromParent();
4530     return Legalized;
4531   }
4532   SmallVector<Register> PartialReductions;
4533   for (unsigned Part = 0; Part < NumParts; ++Part) {
4534     PartialReductions.push_back(
4535         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4536   }
4537 
4538 
4539   // If the types involved are powers of 2, we can generate intermediate vector
4540   // ops, before generating a final reduction operation.
4541   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4542       isPowerOf2_32(NarrowTy.getNumElements())) {
4543     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4544   }
4545 
4546   Register Acc = PartialReductions[0];
4547   for (unsigned Part = 1; Part < NumParts; ++Part) {
4548     if (Part == NumParts - 1) {
4549       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4550                             {Acc, PartialReductions[Part]});
4551     } else {
4552       Acc = MIRBuilder
4553                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4554                 .getReg(0);
4555     }
4556   }
4557   MI.eraseFromParent();
4558   return Legalized;
4559 }
4560 
4561 LegalizerHelper::LegalizeResult
4562 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4563                                         LLT SrcTy, LLT NarrowTy,
4564                                         unsigned ScalarOpc) {
4565   SmallVector<Register> SplitSrcs;
4566   // Split the sources into NarrowTy size pieces.
4567   extractParts(SrcReg, NarrowTy,
4568                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4569   // We're going to do a tree reduction using vector operations until we have
4570   // one NarrowTy size value left.
4571   while (SplitSrcs.size() > 1) {
4572     SmallVector<Register> PartialRdxs;
4573     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4574       Register LHS = SplitSrcs[Idx];
4575       Register RHS = SplitSrcs[Idx + 1];
4576       // Create the intermediate vector op.
4577       Register Res =
4578           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4579       PartialRdxs.push_back(Res);
4580     }
4581     SplitSrcs = std::move(PartialRdxs);
4582   }
4583   // Finally generate the requested NarrowTy based reduction.
4584   Observer.changingInstr(MI);
4585   MI.getOperand(1).setReg(SplitSrcs[0]);
4586   Observer.changedInstr(MI);
4587   return Legalized;
4588 }
4589 
4590 LegalizerHelper::LegalizeResult
4591 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4592                                              const LLT HalfTy, const LLT AmtTy) {
4593 
4594   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4595   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4596   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4597 
4598   if (Amt.isZero()) {
4599     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4600     MI.eraseFromParent();
4601     return Legalized;
4602   }
4603 
4604   LLT NVT = HalfTy;
4605   unsigned NVTBits = HalfTy.getSizeInBits();
4606   unsigned VTBits = 2 * NVTBits;
4607 
4608   SrcOp Lo(Register(0)), Hi(Register(0));
4609   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4610     if (Amt.ugt(VTBits)) {
4611       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4612     } else if (Amt.ugt(NVTBits)) {
4613       Lo = MIRBuilder.buildConstant(NVT, 0);
4614       Hi = MIRBuilder.buildShl(NVT, InL,
4615                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4616     } else if (Amt == NVTBits) {
4617       Lo = MIRBuilder.buildConstant(NVT, 0);
4618       Hi = InL;
4619     } else {
4620       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4621       auto OrLHS =
4622           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4623       auto OrRHS = MIRBuilder.buildLShr(
4624           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4625       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4626     }
4627   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4628     if (Amt.ugt(VTBits)) {
4629       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4630     } else if (Amt.ugt(NVTBits)) {
4631       Lo = MIRBuilder.buildLShr(NVT, InH,
4632                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4633       Hi = MIRBuilder.buildConstant(NVT, 0);
4634     } else if (Amt == NVTBits) {
4635       Lo = InH;
4636       Hi = MIRBuilder.buildConstant(NVT, 0);
4637     } else {
4638       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4639 
4640       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4641       auto OrRHS = MIRBuilder.buildShl(
4642           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4643 
4644       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4645       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4646     }
4647   } else {
4648     if (Amt.ugt(VTBits)) {
4649       Hi = Lo = MIRBuilder.buildAShr(
4650           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4651     } else if (Amt.ugt(NVTBits)) {
4652       Lo = MIRBuilder.buildAShr(NVT, InH,
4653                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4654       Hi = MIRBuilder.buildAShr(NVT, InH,
4655                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4656     } else if (Amt == NVTBits) {
4657       Lo = InH;
4658       Hi = MIRBuilder.buildAShr(NVT, InH,
4659                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4660     } else {
4661       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4662 
4663       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4664       auto OrRHS = MIRBuilder.buildShl(
4665           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4666 
4667       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4668       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4669     }
4670   }
4671 
4672   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4673   MI.eraseFromParent();
4674 
4675   return Legalized;
4676 }
4677 
4678 // TODO: Optimize if constant shift amount.
4679 LegalizerHelper::LegalizeResult
4680 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4681                                    LLT RequestedTy) {
4682   if (TypeIdx == 1) {
4683     Observer.changingInstr(MI);
4684     narrowScalarSrc(MI, RequestedTy, 2);
4685     Observer.changedInstr(MI);
4686     return Legalized;
4687   }
4688 
4689   Register DstReg = MI.getOperand(0).getReg();
4690   LLT DstTy = MRI.getType(DstReg);
4691   if (DstTy.isVector())
4692     return UnableToLegalize;
4693 
4694   Register Amt = MI.getOperand(2).getReg();
4695   LLT ShiftAmtTy = MRI.getType(Amt);
4696   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4697   if (DstEltSize % 2 != 0)
4698     return UnableToLegalize;
4699 
4700   // Ignore the input type. We can only go to exactly half the size of the
4701   // input. If that isn't small enough, the resulting pieces will be further
4702   // legalized.
4703   const unsigned NewBitSize = DstEltSize / 2;
4704   const LLT HalfTy = LLT::scalar(NewBitSize);
4705   const LLT CondTy = LLT::scalar(1);
4706 
4707   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4708     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4709                                        ShiftAmtTy);
4710   }
4711 
4712   // TODO: Expand with known bits.
4713 
4714   // Handle the fully general expansion by an unknown amount.
4715   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4716 
4717   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4718   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4719   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4720 
4721   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4722   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4723 
4724   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4725   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4726   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4727 
4728   Register ResultRegs[2];
4729   switch (MI.getOpcode()) {
4730   case TargetOpcode::G_SHL: {
4731     // Short: ShAmt < NewBitSize
4732     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4733 
4734     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4735     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4736     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4737 
4738     // Long: ShAmt >= NewBitSize
4739     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4740     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4741 
4742     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4743     auto Hi = MIRBuilder.buildSelect(
4744         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4745 
4746     ResultRegs[0] = Lo.getReg(0);
4747     ResultRegs[1] = Hi.getReg(0);
4748     break;
4749   }
4750   case TargetOpcode::G_LSHR:
4751   case TargetOpcode::G_ASHR: {
4752     // Short: ShAmt < NewBitSize
4753     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4754 
4755     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4756     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4757     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4758 
4759     // Long: ShAmt >= NewBitSize
4760     MachineInstrBuilder HiL;
4761     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4762       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4763     } else {
4764       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4765       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4766     }
4767     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4768                                      {InH, AmtExcess});     // Lo from Hi part.
4769 
4770     auto Lo = MIRBuilder.buildSelect(
4771         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4772 
4773     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4774 
4775     ResultRegs[0] = Lo.getReg(0);
4776     ResultRegs[1] = Hi.getReg(0);
4777     break;
4778   }
4779   default:
4780     llvm_unreachable("not a shift");
4781   }
4782 
4783   MIRBuilder.buildMerge(DstReg, ResultRegs);
4784   MI.eraseFromParent();
4785   return Legalized;
4786 }
4787 
4788 LegalizerHelper::LegalizeResult
4789 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4790                                        LLT MoreTy) {
4791   assert(TypeIdx == 0 && "Expecting only Idx 0");
4792 
4793   Observer.changingInstr(MI);
4794   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4795     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4796     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4797     moreElementsVectorSrc(MI, MoreTy, I);
4798   }
4799 
4800   MachineBasicBlock &MBB = *MI.getParent();
4801   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4802   moreElementsVectorDst(MI, MoreTy, 0);
4803   Observer.changedInstr(MI);
4804   return Legalized;
4805 }
4806 
4807 LegalizerHelper::LegalizeResult
4808 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
4809                                     LLT MoreTy) {
4810   unsigned Opc = MI.getOpcode();
4811   switch (Opc) {
4812   case TargetOpcode::G_IMPLICIT_DEF:
4813   case TargetOpcode::G_LOAD: {
4814     if (TypeIdx != 0)
4815       return UnableToLegalize;
4816     Observer.changingInstr(MI);
4817     moreElementsVectorDst(MI, MoreTy, 0);
4818     Observer.changedInstr(MI);
4819     return Legalized;
4820   }
4821   case TargetOpcode::G_STORE:
4822     if (TypeIdx != 0)
4823       return UnableToLegalize;
4824     Observer.changingInstr(MI);
4825     moreElementsVectorSrc(MI, MoreTy, 0);
4826     Observer.changedInstr(MI);
4827     return Legalized;
4828   case TargetOpcode::G_AND:
4829   case TargetOpcode::G_OR:
4830   case TargetOpcode::G_XOR:
4831   case TargetOpcode::G_ADD:
4832   case TargetOpcode::G_SUB:
4833   case TargetOpcode::G_MUL:
4834   case TargetOpcode::G_FADD:
4835   case TargetOpcode::G_FMUL:
4836   case TargetOpcode::G_UADDSAT:
4837   case TargetOpcode::G_USUBSAT:
4838   case TargetOpcode::G_SADDSAT:
4839   case TargetOpcode::G_SSUBSAT:
4840   case TargetOpcode::G_SMIN:
4841   case TargetOpcode::G_SMAX:
4842   case TargetOpcode::G_UMIN:
4843   case TargetOpcode::G_UMAX:
4844   case TargetOpcode::G_FMINNUM:
4845   case TargetOpcode::G_FMAXNUM:
4846   case TargetOpcode::G_FMINNUM_IEEE:
4847   case TargetOpcode::G_FMAXNUM_IEEE:
4848   case TargetOpcode::G_FMINIMUM:
4849   case TargetOpcode::G_FMAXIMUM: {
4850     Observer.changingInstr(MI);
4851     moreElementsVectorSrc(MI, MoreTy, 1);
4852     moreElementsVectorSrc(MI, MoreTy, 2);
4853     moreElementsVectorDst(MI, MoreTy, 0);
4854     Observer.changedInstr(MI);
4855     return Legalized;
4856   }
4857   case TargetOpcode::G_FMA:
4858   case TargetOpcode::G_FSHR:
4859   case TargetOpcode::G_FSHL: {
4860     Observer.changingInstr(MI);
4861     moreElementsVectorSrc(MI, MoreTy, 1);
4862     moreElementsVectorSrc(MI, MoreTy, 2);
4863     moreElementsVectorSrc(MI, MoreTy, 3);
4864     moreElementsVectorDst(MI, MoreTy, 0);
4865     Observer.changedInstr(MI);
4866     return Legalized;
4867   }
4868   case TargetOpcode::G_EXTRACT:
4869     if (TypeIdx != 1)
4870       return UnableToLegalize;
4871     Observer.changingInstr(MI);
4872     moreElementsVectorSrc(MI, MoreTy, 1);
4873     Observer.changedInstr(MI);
4874     return Legalized;
4875   case TargetOpcode::G_INSERT:
4876   case TargetOpcode::G_FREEZE:
4877   case TargetOpcode::G_FNEG:
4878   case TargetOpcode::G_FABS:
4879   case TargetOpcode::G_BSWAP:
4880   case TargetOpcode::G_FCANONICALIZE:
4881   case TargetOpcode::G_SEXT_INREG:
4882     if (TypeIdx != 0)
4883       return UnableToLegalize;
4884     Observer.changingInstr(MI);
4885     moreElementsVectorSrc(MI, MoreTy, 1);
4886     moreElementsVectorDst(MI, MoreTy, 0);
4887     Observer.changedInstr(MI);
4888     return Legalized;
4889   case TargetOpcode::G_SELECT:
4890     if (TypeIdx != 0)
4891       return UnableToLegalize;
4892     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4893       return UnableToLegalize;
4894 
4895     Observer.changingInstr(MI);
4896     moreElementsVectorSrc(MI, MoreTy, 2);
4897     moreElementsVectorSrc(MI, MoreTy, 3);
4898     moreElementsVectorDst(MI, MoreTy, 0);
4899     Observer.changedInstr(MI);
4900     return Legalized;
4901   case TargetOpcode::G_UNMERGE_VALUES:
4902     return UnableToLegalize;
4903   case TargetOpcode::G_PHI:
4904     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
4905   case TargetOpcode::G_SHUFFLE_VECTOR:
4906     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
4907   case TargetOpcode::G_BUILD_VECTOR: {
4908     SmallVector<SrcOp, 8> Elts;
4909     for (auto Op : MI.uses()) {
4910       Elts.push_back(Op.getReg());
4911     }
4912 
4913     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
4914       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
4915     }
4916 
4917     MIRBuilder.buildDeleteTrailingVectorElements(
4918         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
4919     MI.eraseFromParent();
4920     return Legalized;
4921   }
4922   case TargetOpcode::G_TRUNC: {
4923     Observer.changingInstr(MI);
4924     moreElementsVectorSrc(MI, MoreTy, 1);
4925     moreElementsVectorDst(MI, MoreTy, 0);
4926     Observer.changedInstr(MI);
4927     return Legalized;
4928   }
4929   default:
4930     return UnableToLegalize;
4931   }
4932 }
4933 
4934 LegalizerHelper::LegalizeResult
4935 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
4936                                            unsigned int TypeIdx, LLT MoreTy) {
4937   if (TypeIdx != 0)
4938     return UnableToLegalize;
4939 
4940   Register DstReg = MI.getOperand(0).getReg();
4941   Register Src1Reg = MI.getOperand(1).getReg();
4942   Register Src2Reg = MI.getOperand(2).getReg();
4943   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4944   LLT DstTy = MRI.getType(DstReg);
4945   LLT Src1Ty = MRI.getType(Src1Reg);
4946   LLT Src2Ty = MRI.getType(Src2Reg);
4947   unsigned NumElts = DstTy.getNumElements();
4948   unsigned WidenNumElts = MoreTy.getNumElements();
4949 
4950   // Expect a canonicalized shuffle.
4951   if (DstTy != Src1Ty || DstTy != Src2Ty)
4952     return UnableToLegalize;
4953 
4954   moreElementsVectorSrc(MI, MoreTy, 1);
4955   moreElementsVectorSrc(MI, MoreTy, 2);
4956 
4957   // Adjust mask based on new input vector length.
4958   SmallVector<int, 16> NewMask;
4959   for (unsigned I = 0; I != NumElts; ++I) {
4960     int Idx = Mask[I];
4961     if (Idx < static_cast<int>(NumElts))
4962       NewMask.push_back(Idx);
4963     else
4964       NewMask.push_back(Idx - NumElts + WidenNumElts);
4965   }
4966   for (unsigned I = NumElts; I != WidenNumElts; ++I)
4967     NewMask.push_back(-1);
4968   moreElementsVectorDst(MI, MoreTy, 0);
4969   MIRBuilder.setInstrAndDebugLoc(MI);
4970   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4971                                 MI.getOperand(1).getReg(),
4972                                 MI.getOperand(2).getReg(), NewMask);
4973   MI.eraseFromParent();
4974   return Legalized;
4975 }
4976 
4977 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
4978                                         ArrayRef<Register> Src1Regs,
4979                                         ArrayRef<Register> Src2Regs,
4980                                         LLT NarrowTy) {
4981   MachineIRBuilder &B = MIRBuilder;
4982   unsigned SrcParts = Src1Regs.size();
4983   unsigned DstParts = DstRegs.size();
4984 
4985   unsigned DstIdx = 0; // Low bits of the result.
4986   Register FactorSum =
4987       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
4988   DstRegs[DstIdx] = FactorSum;
4989 
4990   unsigned CarrySumPrevDstIdx;
4991   SmallVector<Register, 4> Factors;
4992 
4993   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
4994     // Collect low parts of muls for DstIdx.
4995     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
4996          i <= std::min(DstIdx, SrcParts - 1); ++i) {
4997       MachineInstrBuilder Mul =
4998           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
4999       Factors.push_back(Mul.getReg(0));
5000     }
5001     // Collect high parts of muls from previous DstIdx.
5002     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5003          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5004       MachineInstrBuilder Umulh =
5005           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5006       Factors.push_back(Umulh.getReg(0));
5007     }
5008     // Add CarrySum from additions calculated for previous DstIdx.
5009     if (DstIdx != 1) {
5010       Factors.push_back(CarrySumPrevDstIdx);
5011     }
5012 
5013     Register CarrySum;
5014     // Add all factors and accumulate all carries into CarrySum.
5015     if (DstIdx != DstParts - 1) {
5016       MachineInstrBuilder Uaddo =
5017           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5018       FactorSum = Uaddo.getReg(0);
5019       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5020       for (unsigned i = 2; i < Factors.size(); ++i) {
5021         MachineInstrBuilder Uaddo =
5022             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5023         FactorSum = Uaddo.getReg(0);
5024         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5025         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5026       }
5027     } else {
5028       // Since value for the next index is not calculated, neither is CarrySum.
5029       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5030       for (unsigned i = 2; i < Factors.size(); ++i)
5031         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5032     }
5033 
5034     CarrySumPrevDstIdx = CarrySum;
5035     DstRegs[DstIdx] = FactorSum;
5036     Factors.clear();
5037   }
5038 }
5039 
5040 LegalizerHelper::LegalizeResult
5041 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5042                                     LLT NarrowTy) {
5043   if (TypeIdx != 0)
5044     return UnableToLegalize;
5045 
5046   Register DstReg = MI.getOperand(0).getReg();
5047   LLT DstType = MRI.getType(DstReg);
5048   // FIXME: add support for vector types
5049   if (DstType.isVector())
5050     return UnableToLegalize;
5051 
5052   unsigned Opcode = MI.getOpcode();
5053   unsigned OpO, OpE, OpF;
5054   switch (Opcode) {
5055   case TargetOpcode::G_SADDO:
5056   case TargetOpcode::G_SADDE:
5057   case TargetOpcode::G_UADDO:
5058   case TargetOpcode::G_UADDE:
5059   case TargetOpcode::G_ADD:
5060     OpO = TargetOpcode::G_UADDO;
5061     OpE = TargetOpcode::G_UADDE;
5062     OpF = TargetOpcode::G_UADDE;
5063     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5064       OpF = TargetOpcode::G_SADDE;
5065     break;
5066   case TargetOpcode::G_SSUBO:
5067   case TargetOpcode::G_SSUBE:
5068   case TargetOpcode::G_USUBO:
5069   case TargetOpcode::G_USUBE:
5070   case TargetOpcode::G_SUB:
5071     OpO = TargetOpcode::G_USUBO;
5072     OpE = TargetOpcode::G_USUBE;
5073     OpF = TargetOpcode::G_USUBE;
5074     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5075       OpF = TargetOpcode::G_SSUBE;
5076     break;
5077   default:
5078     llvm_unreachable("Unexpected add/sub opcode!");
5079   }
5080 
5081   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5082   unsigned NumDefs = MI.getNumExplicitDefs();
5083   Register Src1 = MI.getOperand(NumDefs).getReg();
5084   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5085   Register CarryDst, CarryIn;
5086   if (NumDefs == 2)
5087     CarryDst = MI.getOperand(1).getReg();
5088   if (MI.getNumOperands() == NumDefs + 3)
5089     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5090 
5091   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5092   LLT LeftoverTy, DummyTy;
5093   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5094   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5095   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5096 
5097   int NarrowParts = Src1Regs.size();
5098   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5099     Src1Regs.push_back(Src1Left[I]);
5100     Src2Regs.push_back(Src2Left[I]);
5101   }
5102   DstRegs.reserve(Src1Regs.size());
5103 
5104   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5105     Register DstReg =
5106         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5107     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5108     // Forward the final carry-out to the destination register
5109     if (i == e - 1 && CarryDst)
5110       CarryOut = CarryDst;
5111 
5112     if (!CarryIn) {
5113       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5114                             {Src1Regs[i], Src2Regs[i]});
5115     } else if (i == e - 1) {
5116       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5117                             {Src1Regs[i], Src2Regs[i], CarryIn});
5118     } else {
5119       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5120                             {Src1Regs[i], Src2Regs[i], CarryIn});
5121     }
5122 
5123     DstRegs.push_back(DstReg);
5124     CarryIn = CarryOut;
5125   }
5126   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5127               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5128               makeArrayRef(DstRegs).drop_front(NarrowParts));
5129 
5130   MI.eraseFromParent();
5131   return Legalized;
5132 }
5133 
5134 LegalizerHelper::LegalizeResult
5135 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5136   Register DstReg = MI.getOperand(0).getReg();
5137   Register Src1 = MI.getOperand(1).getReg();
5138   Register Src2 = MI.getOperand(2).getReg();
5139 
5140   LLT Ty = MRI.getType(DstReg);
5141   if (Ty.isVector())
5142     return UnableToLegalize;
5143 
5144   unsigned Size = Ty.getSizeInBits();
5145   unsigned NarrowSize = NarrowTy.getSizeInBits();
5146   if (Size % NarrowSize != 0)
5147     return UnableToLegalize;
5148 
5149   unsigned NumParts = Size / NarrowSize;
5150   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5151   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5152 
5153   SmallVector<Register, 2> Src1Parts, Src2Parts;
5154   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5155   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5156   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5157   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5158 
5159   // Take only high half of registers if this is high mul.
5160   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5161   MIRBuilder.buildMerge(DstReg, DstRegs);
5162   MI.eraseFromParent();
5163   return Legalized;
5164 }
5165 
5166 LegalizerHelper::LegalizeResult
5167 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5168                                    LLT NarrowTy) {
5169   if (TypeIdx != 0)
5170     return UnableToLegalize;
5171 
5172   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5173 
5174   Register Src = MI.getOperand(1).getReg();
5175   LLT SrcTy = MRI.getType(Src);
5176 
5177   // If all finite floats fit into the narrowed integer type, we can just swap
5178   // out the result type. This is practically only useful for conversions from
5179   // half to at least 16-bits, so just handle the one case.
5180   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5181       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5182     return UnableToLegalize;
5183 
5184   Observer.changingInstr(MI);
5185   narrowScalarDst(MI, NarrowTy, 0,
5186                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5187   Observer.changedInstr(MI);
5188   return Legalized;
5189 }
5190 
5191 LegalizerHelper::LegalizeResult
5192 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5193                                      LLT NarrowTy) {
5194   if (TypeIdx != 1)
5195     return UnableToLegalize;
5196 
5197   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5198 
5199   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5200   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5201   // NarrowSize.
5202   if (SizeOp1 % NarrowSize != 0)
5203     return UnableToLegalize;
5204   int NumParts = SizeOp1 / NarrowSize;
5205 
5206   SmallVector<Register, 2> SrcRegs, DstRegs;
5207   SmallVector<uint64_t, 2> Indexes;
5208   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5209 
5210   Register OpReg = MI.getOperand(0).getReg();
5211   uint64_t OpStart = MI.getOperand(2).getImm();
5212   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5213   for (int i = 0; i < NumParts; ++i) {
5214     unsigned SrcStart = i * NarrowSize;
5215 
5216     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5217       // No part of the extract uses this subregister, ignore it.
5218       continue;
5219     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5220       // The entire subregister is extracted, forward the value.
5221       DstRegs.push_back(SrcRegs[i]);
5222       continue;
5223     }
5224 
5225     // OpSegStart is where this destination segment would start in OpReg if it
5226     // extended infinitely in both directions.
5227     int64_t ExtractOffset;
5228     uint64_t SegSize;
5229     if (OpStart < SrcStart) {
5230       ExtractOffset = 0;
5231       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5232     } else {
5233       ExtractOffset = OpStart - SrcStart;
5234       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5235     }
5236 
5237     Register SegReg = SrcRegs[i];
5238     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5239       // A genuine extract is needed.
5240       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5241       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5242     }
5243 
5244     DstRegs.push_back(SegReg);
5245   }
5246 
5247   Register DstReg = MI.getOperand(0).getReg();
5248   if (MRI.getType(DstReg).isVector())
5249     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5250   else if (DstRegs.size() > 1)
5251     MIRBuilder.buildMerge(DstReg, DstRegs);
5252   else
5253     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5254   MI.eraseFromParent();
5255   return Legalized;
5256 }
5257 
5258 LegalizerHelper::LegalizeResult
5259 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5260                                     LLT NarrowTy) {
5261   // FIXME: Don't know how to handle secondary types yet.
5262   if (TypeIdx != 0)
5263     return UnableToLegalize;
5264 
5265   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5266   SmallVector<uint64_t, 2> Indexes;
5267   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5268   LLT LeftoverTy;
5269   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5270                LeftoverRegs);
5271 
5272   for (Register Reg : LeftoverRegs)
5273     SrcRegs.push_back(Reg);
5274 
5275   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5276   Register OpReg = MI.getOperand(2).getReg();
5277   uint64_t OpStart = MI.getOperand(3).getImm();
5278   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5279   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5280     unsigned DstStart = I * NarrowSize;
5281 
5282     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5283       // The entire subregister is defined by this insert, forward the new
5284       // value.
5285       DstRegs.push_back(OpReg);
5286       continue;
5287     }
5288 
5289     Register SrcReg = SrcRegs[I];
5290     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5291       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5292       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5293       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5294     }
5295 
5296     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5297       // No part of the insert affects this subregister, forward the original.
5298       DstRegs.push_back(SrcReg);
5299       continue;
5300     }
5301 
5302     // OpSegStart is where this destination segment would start in OpReg if it
5303     // extended infinitely in both directions.
5304     int64_t ExtractOffset, InsertOffset;
5305     uint64_t SegSize;
5306     if (OpStart < DstStart) {
5307       InsertOffset = 0;
5308       ExtractOffset = DstStart - OpStart;
5309       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5310     } else {
5311       InsertOffset = OpStart - DstStart;
5312       ExtractOffset = 0;
5313       SegSize =
5314         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5315     }
5316 
5317     Register SegReg = OpReg;
5318     if (ExtractOffset != 0 || SegSize != OpSize) {
5319       // A genuine extract is needed.
5320       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5321       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5322     }
5323 
5324     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5325     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5326     DstRegs.push_back(DstReg);
5327   }
5328 
5329   uint64_t WideSize = DstRegs.size() * NarrowSize;
5330   Register DstReg = MI.getOperand(0).getReg();
5331   if (WideSize > RegTy.getSizeInBits()) {
5332     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5333     MIRBuilder.buildMerge(MergeReg, DstRegs);
5334     MIRBuilder.buildTrunc(DstReg, MergeReg);
5335   } else
5336     MIRBuilder.buildMerge(DstReg, DstRegs);
5337 
5338   MI.eraseFromParent();
5339   return Legalized;
5340 }
5341 
5342 LegalizerHelper::LegalizeResult
5343 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5344                                    LLT NarrowTy) {
5345   Register DstReg = MI.getOperand(0).getReg();
5346   LLT DstTy = MRI.getType(DstReg);
5347 
5348   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5349 
5350   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5351   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5352   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5353   LLT LeftoverTy;
5354   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5355                     Src0Regs, Src0LeftoverRegs))
5356     return UnableToLegalize;
5357 
5358   LLT Unused;
5359   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5360                     Src1Regs, Src1LeftoverRegs))
5361     llvm_unreachable("inconsistent extractParts result");
5362 
5363   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5364     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5365                                         {Src0Regs[I], Src1Regs[I]});
5366     DstRegs.push_back(Inst.getReg(0));
5367   }
5368 
5369   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5370     auto Inst = MIRBuilder.buildInstr(
5371       MI.getOpcode(),
5372       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5373     DstLeftoverRegs.push_back(Inst.getReg(0));
5374   }
5375 
5376   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5377               LeftoverTy, DstLeftoverRegs);
5378 
5379   MI.eraseFromParent();
5380   return Legalized;
5381 }
5382 
5383 LegalizerHelper::LegalizeResult
5384 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5385                                  LLT NarrowTy) {
5386   if (TypeIdx != 0)
5387     return UnableToLegalize;
5388 
5389   Register DstReg = MI.getOperand(0).getReg();
5390   Register SrcReg = MI.getOperand(1).getReg();
5391 
5392   LLT DstTy = MRI.getType(DstReg);
5393   if (DstTy.isVector())
5394     return UnableToLegalize;
5395 
5396   SmallVector<Register, 8> Parts;
5397   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5398   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5399   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5400 
5401   MI.eraseFromParent();
5402   return Legalized;
5403 }
5404 
5405 LegalizerHelper::LegalizeResult
5406 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5407                                     LLT NarrowTy) {
5408   if (TypeIdx != 0)
5409     return UnableToLegalize;
5410 
5411   Register CondReg = MI.getOperand(1).getReg();
5412   LLT CondTy = MRI.getType(CondReg);
5413   if (CondTy.isVector()) // TODO: Handle vselect
5414     return UnableToLegalize;
5415 
5416   Register DstReg = MI.getOperand(0).getReg();
5417   LLT DstTy = MRI.getType(DstReg);
5418 
5419   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5420   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5421   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5422   LLT LeftoverTy;
5423   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5424                     Src1Regs, Src1LeftoverRegs))
5425     return UnableToLegalize;
5426 
5427   LLT Unused;
5428   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5429                     Src2Regs, Src2LeftoverRegs))
5430     llvm_unreachable("inconsistent extractParts result");
5431 
5432   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5433     auto Select = MIRBuilder.buildSelect(NarrowTy,
5434                                          CondReg, Src1Regs[I], Src2Regs[I]);
5435     DstRegs.push_back(Select.getReg(0));
5436   }
5437 
5438   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5439     auto Select = MIRBuilder.buildSelect(
5440       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5441     DstLeftoverRegs.push_back(Select.getReg(0));
5442   }
5443 
5444   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5445               LeftoverTy, DstLeftoverRegs);
5446 
5447   MI.eraseFromParent();
5448   return Legalized;
5449 }
5450 
5451 LegalizerHelper::LegalizeResult
5452 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5453                                   LLT NarrowTy) {
5454   if (TypeIdx != 1)
5455     return UnableToLegalize;
5456 
5457   Register DstReg = MI.getOperand(0).getReg();
5458   Register SrcReg = MI.getOperand(1).getReg();
5459   LLT DstTy = MRI.getType(DstReg);
5460   LLT SrcTy = MRI.getType(SrcReg);
5461   unsigned NarrowSize = NarrowTy.getSizeInBits();
5462 
5463   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5464     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5465 
5466     MachineIRBuilder &B = MIRBuilder;
5467     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5468     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5469     auto C_0 = B.buildConstant(NarrowTy, 0);
5470     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5471                                 UnmergeSrc.getReg(1), C_0);
5472     auto LoCTLZ = IsUndef ?
5473       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5474       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5475     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5476     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5477     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5478     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5479 
5480     MI.eraseFromParent();
5481     return Legalized;
5482   }
5483 
5484   return UnableToLegalize;
5485 }
5486 
5487 LegalizerHelper::LegalizeResult
5488 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5489                                   LLT NarrowTy) {
5490   if (TypeIdx != 1)
5491     return UnableToLegalize;
5492 
5493   Register DstReg = MI.getOperand(0).getReg();
5494   Register SrcReg = MI.getOperand(1).getReg();
5495   LLT DstTy = MRI.getType(DstReg);
5496   LLT SrcTy = MRI.getType(SrcReg);
5497   unsigned NarrowSize = NarrowTy.getSizeInBits();
5498 
5499   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5500     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5501 
5502     MachineIRBuilder &B = MIRBuilder;
5503     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5504     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5505     auto C_0 = B.buildConstant(NarrowTy, 0);
5506     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5507                                 UnmergeSrc.getReg(0), C_0);
5508     auto HiCTTZ = IsUndef ?
5509       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5510       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5511     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5512     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5513     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5514     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5515 
5516     MI.eraseFromParent();
5517     return Legalized;
5518   }
5519 
5520   return UnableToLegalize;
5521 }
5522 
5523 LegalizerHelper::LegalizeResult
5524 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5525                                    LLT NarrowTy) {
5526   if (TypeIdx != 1)
5527     return UnableToLegalize;
5528 
5529   Register DstReg = MI.getOperand(0).getReg();
5530   LLT DstTy = MRI.getType(DstReg);
5531   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5532   unsigned NarrowSize = NarrowTy.getSizeInBits();
5533 
5534   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5535     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5536 
5537     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5538     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5539     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5540 
5541     MI.eraseFromParent();
5542     return Legalized;
5543   }
5544 
5545   return UnableToLegalize;
5546 }
5547 
5548 LegalizerHelper::LegalizeResult
5549 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5550   unsigned Opc = MI.getOpcode();
5551   const auto &TII = MIRBuilder.getTII();
5552   auto isSupported = [this](const LegalityQuery &Q) {
5553     auto QAction = LI.getAction(Q).Action;
5554     return QAction == Legal || QAction == Libcall || QAction == Custom;
5555   };
5556   switch (Opc) {
5557   default:
5558     return UnableToLegalize;
5559   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5560     // This trivially expands to CTLZ.
5561     Observer.changingInstr(MI);
5562     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5563     Observer.changedInstr(MI);
5564     return Legalized;
5565   }
5566   case TargetOpcode::G_CTLZ: {
5567     Register DstReg = MI.getOperand(0).getReg();
5568     Register SrcReg = MI.getOperand(1).getReg();
5569     LLT DstTy = MRI.getType(DstReg);
5570     LLT SrcTy = MRI.getType(SrcReg);
5571     unsigned Len = SrcTy.getSizeInBits();
5572 
5573     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5574       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5575       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5576       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5577       auto ICmp = MIRBuilder.buildICmp(
5578           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5579       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5580       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5581       MI.eraseFromParent();
5582       return Legalized;
5583     }
5584     // for now, we do this:
5585     // NewLen = NextPowerOf2(Len);
5586     // x = x | (x >> 1);
5587     // x = x | (x >> 2);
5588     // ...
5589     // x = x | (x >>16);
5590     // x = x | (x >>32); // for 64-bit input
5591     // Upto NewLen/2
5592     // return Len - popcount(x);
5593     //
5594     // Ref: "Hacker's Delight" by Henry Warren
5595     Register Op = SrcReg;
5596     unsigned NewLen = PowerOf2Ceil(Len);
5597     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5598       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5599       auto MIBOp = MIRBuilder.buildOr(
5600           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5601       Op = MIBOp.getReg(0);
5602     }
5603     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5604     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5605                         MIBPop);
5606     MI.eraseFromParent();
5607     return Legalized;
5608   }
5609   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5610     // This trivially expands to CTTZ.
5611     Observer.changingInstr(MI);
5612     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5613     Observer.changedInstr(MI);
5614     return Legalized;
5615   }
5616   case TargetOpcode::G_CTTZ: {
5617     Register DstReg = MI.getOperand(0).getReg();
5618     Register SrcReg = MI.getOperand(1).getReg();
5619     LLT DstTy = MRI.getType(DstReg);
5620     LLT SrcTy = MRI.getType(SrcReg);
5621 
5622     unsigned Len = SrcTy.getSizeInBits();
5623     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5624       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5625       // zero.
5626       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5627       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5628       auto ICmp = MIRBuilder.buildICmp(
5629           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5630       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5631       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5632       MI.eraseFromParent();
5633       return Legalized;
5634     }
5635     // for now, we use: { return popcount(~x & (x - 1)); }
5636     // unless the target has ctlz but not ctpop, in which case we use:
5637     // { return 32 - nlz(~x & (x-1)); }
5638     // Ref: "Hacker's Delight" by Henry Warren
5639     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5640     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5641     auto MIBTmp = MIRBuilder.buildAnd(
5642         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5643     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5644         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5645       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5646       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5647                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5648       MI.eraseFromParent();
5649       return Legalized;
5650     }
5651     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5652     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5653     return Legalized;
5654   }
5655   case TargetOpcode::G_CTPOP: {
5656     Register SrcReg = MI.getOperand(1).getReg();
5657     LLT Ty = MRI.getType(SrcReg);
5658     unsigned Size = Ty.getSizeInBits();
5659     MachineIRBuilder &B = MIRBuilder;
5660 
5661     // Count set bits in blocks of 2 bits. Default approach would be
5662     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5663     // We use following formula instead:
5664     // B2Count = val - { (val >> 1) & 0x55555555 }
5665     // since it gives same result in blocks of 2 with one instruction less.
5666     auto C_1 = B.buildConstant(Ty, 1);
5667     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5668     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5669     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5670     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5671     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5672 
5673     // In order to get count in blocks of 4 add values from adjacent block of 2.
5674     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5675     auto C_2 = B.buildConstant(Ty, 2);
5676     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5677     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5678     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5679     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5680     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5681     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5682 
5683     // For count in blocks of 8 bits we don't have to mask high 4 bits before
5684     // addition since count value sits in range {0,...,8} and 4 bits are enough
5685     // to hold such binary values. After addition high 4 bits still hold count
5686     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
5687     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5688     auto C_4 = B.buildConstant(Ty, 4);
5689     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5690     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5691     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5692     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5693     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5694 
5695     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
5696     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
5697     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
5698     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5699     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5700 
5701     // Shift count result from 8 high bits to low bits.
5702     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5703     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5704 
5705     MI.eraseFromParent();
5706     return Legalized;
5707   }
5708   }
5709 }
5710 
5711 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5712 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5713                                         Register Reg, unsigned BW) {
5714   return matchUnaryPredicate(
5715       MRI, Reg,
5716       [=](const Constant *C) {
5717         // Null constant here means an undef.
5718         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5719         return !CI || CI->getValue().urem(BW) != 0;
5720       },
5721       /*AllowUndefs*/ true);
5722 }
5723 
5724 LegalizerHelper::LegalizeResult
5725 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5726   Register Dst = MI.getOperand(0).getReg();
5727   Register X = MI.getOperand(1).getReg();
5728   Register Y = MI.getOperand(2).getReg();
5729   Register Z = MI.getOperand(3).getReg();
5730   LLT Ty = MRI.getType(Dst);
5731   LLT ShTy = MRI.getType(Z);
5732 
5733   unsigned BW = Ty.getScalarSizeInBits();
5734 
5735   if (!isPowerOf2_32(BW))
5736     return UnableToLegalize;
5737 
5738   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5739   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5740 
5741   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5742     // fshl X, Y, Z -> fshr X, Y, -Z
5743     // fshr X, Y, Z -> fshl X, Y, -Z
5744     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5745     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5746   } else {
5747     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5748     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5749     auto One = MIRBuilder.buildConstant(ShTy, 1);
5750     if (IsFSHL) {
5751       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5752       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5753     } else {
5754       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5755       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5756     }
5757 
5758     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5759   }
5760 
5761   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5762   MI.eraseFromParent();
5763   return Legalized;
5764 }
5765 
5766 LegalizerHelper::LegalizeResult
5767 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5768   Register Dst = MI.getOperand(0).getReg();
5769   Register X = MI.getOperand(1).getReg();
5770   Register Y = MI.getOperand(2).getReg();
5771   Register Z = MI.getOperand(3).getReg();
5772   LLT Ty = MRI.getType(Dst);
5773   LLT ShTy = MRI.getType(Z);
5774 
5775   const unsigned BW = Ty.getScalarSizeInBits();
5776   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5777 
5778   Register ShX, ShY;
5779   Register ShAmt, InvShAmt;
5780 
5781   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5782   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5783     // fshl: X << C | Y >> (BW - C)
5784     // fshr: X << (BW - C) | Y >> C
5785     // where C = Z % BW is not zero
5786     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5787     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5788     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5789     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5790     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5791   } else {
5792     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5793     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5794     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5795     if (isPowerOf2_32(BW)) {
5796       // Z % BW -> Z & (BW - 1)
5797       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5798       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5799       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5800       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5801     } else {
5802       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5803       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5804       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5805     }
5806 
5807     auto One = MIRBuilder.buildConstant(ShTy, 1);
5808     if (IsFSHL) {
5809       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5810       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5811       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5812     } else {
5813       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5814       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5815       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5816     }
5817   }
5818 
5819   MIRBuilder.buildOr(Dst, ShX, ShY);
5820   MI.eraseFromParent();
5821   return Legalized;
5822 }
5823 
5824 LegalizerHelper::LegalizeResult
5825 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5826   // These operations approximately do the following (while avoiding undefined
5827   // shifts by BW):
5828   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5829   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5830   Register Dst = MI.getOperand(0).getReg();
5831   LLT Ty = MRI.getType(Dst);
5832   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5833 
5834   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5835   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5836 
5837   // TODO: Use smarter heuristic that accounts for vector legalization.
5838   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5839     return lowerFunnelShiftAsShifts(MI);
5840 
5841   // This only works for powers of 2, fallback to shifts if it fails.
5842   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5843   if (Result == UnableToLegalize)
5844     return lowerFunnelShiftAsShifts(MI);
5845   return Result;
5846 }
5847 
5848 LegalizerHelper::LegalizeResult
5849 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5850   Register Dst = MI.getOperand(0).getReg();
5851   Register Src = MI.getOperand(1).getReg();
5852   Register Amt = MI.getOperand(2).getReg();
5853   LLT AmtTy = MRI.getType(Amt);
5854   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5855   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5856   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5857   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5858   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5859   MI.eraseFromParent();
5860   return Legalized;
5861 }
5862 
5863 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5864   Register Dst = MI.getOperand(0).getReg();
5865   Register Src = MI.getOperand(1).getReg();
5866   Register Amt = MI.getOperand(2).getReg();
5867   LLT DstTy = MRI.getType(Dst);
5868   LLT SrcTy = MRI.getType(Src);
5869   LLT AmtTy = MRI.getType(Amt);
5870 
5871   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5872   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5873 
5874   MIRBuilder.setInstrAndDebugLoc(MI);
5875 
5876   // If a rotate in the other direction is supported, use it.
5877   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5878   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5879       isPowerOf2_32(EltSizeInBits))
5880     return lowerRotateWithReverseRotate(MI);
5881 
5882   // If a funnel shift is supported, use it.
5883   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5884   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5885   bool IsFShLegal = false;
5886   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
5887       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
5888     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
5889                                 Register R3) {
5890       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
5891       MI.eraseFromParent();
5892       return Legalized;
5893     };
5894     // If a funnel shift in the other direction is supported, use it.
5895     if (IsFShLegal) {
5896       return buildFunnelShift(FShOpc, Dst, Src, Amt);
5897     } else if (isPowerOf2_32(EltSizeInBits)) {
5898       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
5899       return buildFunnelShift(RevFsh, Dst, Src, Amt);
5900     }
5901   }
5902 
5903   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5904   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
5905   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
5906   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
5907   Register ShVal;
5908   Register RevShiftVal;
5909   if (isPowerOf2_32(EltSizeInBits)) {
5910     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
5911     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
5912     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5913     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
5914     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5915     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
5916     RevShiftVal =
5917         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
5918   } else {
5919     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
5920     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
5921     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
5922     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
5923     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5924     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
5925     auto One = MIRBuilder.buildConstant(AmtTy, 1);
5926     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
5927     RevShiftVal =
5928         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
5929   }
5930   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
5931   MI.eraseFromParent();
5932   return Legalized;
5933 }
5934 
5935 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
5936 // representation.
5937 LegalizerHelper::LegalizeResult
5938 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
5939   Register Dst = MI.getOperand(0).getReg();
5940   Register Src = MI.getOperand(1).getReg();
5941   const LLT S64 = LLT::scalar(64);
5942   const LLT S32 = LLT::scalar(32);
5943   const LLT S1 = LLT::scalar(1);
5944 
5945   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
5946 
5947   // unsigned cul2f(ulong u) {
5948   //   uint lz = clz(u);
5949   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
5950   //   u = (u << lz) & 0x7fffffffffffffffUL;
5951   //   ulong t = u & 0xffffffffffUL;
5952   //   uint v = (e << 23) | (uint)(u >> 40);
5953   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
5954   //   return as_float(v + r);
5955   // }
5956 
5957   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
5958   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
5959 
5960   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
5961 
5962   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
5963   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
5964 
5965   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
5966   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
5967 
5968   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
5969   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
5970 
5971   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
5972 
5973   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
5974   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
5975 
5976   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
5977   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
5978   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
5979 
5980   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
5981   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
5982   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
5983   auto One = MIRBuilder.buildConstant(S32, 1);
5984 
5985   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
5986   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
5987   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
5988   MIRBuilder.buildAdd(Dst, V, R);
5989 
5990   MI.eraseFromParent();
5991   return Legalized;
5992 }
5993 
5994 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5995   Register Dst = MI.getOperand(0).getReg();
5996   Register Src = MI.getOperand(1).getReg();
5997   LLT DstTy = MRI.getType(Dst);
5998   LLT SrcTy = MRI.getType(Src);
5999 
6000   if (SrcTy == LLT::scalar(1)) {
6001     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6002     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6003     MIRBuilder.buildSelect(Dst, Src, True, False);
6004     MI.eraseFromParent();
6005     return Legalized;
6006   }
6007 
6008   if (SrcTy != LLT::scalar(64))
6009     return UnableToLegalize;
6010 
6011   if (DstTy == LLT::scalar(32)) {
6012     // TODO: SelectionDAG has several alternative expansions to port which may
6013     // be more reasonble depending on the available instructions. If a target
6014     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6015     // intermediate type, this is probably worse.
6016     return lowerU64ToF32BitOps(MI);
6017   }
6018 
6019   return UnableToLegalize;
6020 }
6021 
6022 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6023   Register Dst = MI.getOperand(0).getReg();
6024   Register Src = MI.getOperand(1).getReg();
6025   LLT DstTy = MRI.getType(Dst);
6026   LLT SrcTy = MRI.getType(Src);
6027 
6028   const LLT S64 = LLT::scalar(64);
6029   const LLT S32 = LLT::scalar(32);
6030   const LLT S1 = LLT::scalar(1);
6031 
6032   if (SrcTy == S1) {
6033     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6034     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6035     MIRBuilder.buildSelect(Dst, Src, True, False);
6036     MI.eraseFromParent();
6037     return Legalized;
6038   }
6039 
6040   if (SrcTy != S64)
6041     return UnableToLegalize;
6042 
6043   if (DstTy == S32) {
6044     // signed cl2f(long l) {
6045     //   long s = l >> 63;
6046     //   float r = cul2f((l + s) ^ s);
6047     //   return s ? -r : r;
6048     // }
6049     Register L = Src;
6050     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6051     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6052 
6053     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6054     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6055     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6056 
6057     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6058     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6059                                             MIRBuilder.buildConstant(S64, 0));
6060     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6061     MI.eraseFromParent();
6062     return Legalized;
6063   }
6064 
6065   return UnableToLegalize;
6066 }
6067 
6068 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6069   Register Dst = MI.getOperand(0).getReg();
6070   Register Src = MI.getOperand(1).getReg();
6071   LLT DstTy = MRI.getType(Dst);
6072   LLT SrcTy = MRI.getType(Src);
6073   const LLT S64 = LLT::scalar(64);
6074   const LLT S32 = LLT::scalar(32);
6075 
6076   if (SrcTy != S64 && SrcTy != S32)
6077     return UnableToLegalize;
6078   if (DstTy != S32 && DstTy != S64)
6079     return UnableToLegalize;
6080 
6081   // FPTOSI gives same result as FPTOUI for positive signed integers.
6082   // FPTOUI needs to deal with fp values that convert to unsigned integers
6083   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6084 
6085   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6086   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6087                                                 : APFloat::IEEEdouble(),
6088                     APInt::getZero(SrcTy.getSizeInBits()));
6089   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6090 
6091   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6092 
6093   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6094   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6095   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6096   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6097   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6098   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6099   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6100 
6101   const LLT S1 = LLT::scalar(1);
6102 
6103   MachineInstrBuilder FCMP =
6104       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6105   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6106 
6107   MI.eraseFromParent();
6108   return Legalized;
6109 }
6110 
6111 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6112   Register Dst = MI.getOperand(0).getReg();
6113   Register Src = MI.getOperand(1).getReg();
6114   LLT DstTy = MRI.getType(Dst);
6115   LLT SrcTy = MRI.getType(Src);
6116   const LLT S64 = LLT::scalar(64);
6117   const LLT S32 = LLT::scalar(32);
6118 
6119   // FIXME: Only f32 to i64 conversions are supported.
6120   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6121     return UnableToLegalize;
6122 
6123   // Expand f32 -> i64 conversion
6124   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6125   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6126 
6127   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6128 
6129   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6130   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6131 
6132   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6133   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6134 
6135   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6136                                            APInt::getSignMask(SrcEltBits));
6137   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6138   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6139   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6140   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6141 
6142   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6143   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6144   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6145 
6146   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6147   R = MIRBuilder.buildZExt(DstTy, R);
6148 
6149   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6150   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6151   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6152   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6153 
6154   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6155   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6156 
6157   const LLT S1 = LLT::scalar(1);
6158   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6159                                     S1, Exponent, ExponentLoBit);
6160 
6161   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6162 
6163   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6164   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6165 
6166   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6167 
6168   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6169                                           S1, Exponent, ZeroSrcTy);
6170 
6171   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6172   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6173 
6174   MI.eraseFromParent();
6175   return Legalized;
6176 }
6177 
6178 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6179 LegalizerHelper::LegalizeResult
6180 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6181   Register Dst = MI.getOperand(0).getReg();
6182   Register Src = MI.getOperand(1).getReg();
6183 
6184   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6185     return UnableToLegalize;
6186 
6187   const unsigned ExpMask = 0x7ff;
6188   const unsigned ExpBiasf64 = 1023;
6189   const unsigned ExpBiasf16 = 15;
6190   const LLT S32 = LLT::scalar(32);
6191   const LLT S1 = LLT::scalar(1);
6192 
6193   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6194   Register U = Unmerge.getReg(0);
6195   Register UH = Unmerge.getReg(1);
6196 
6197   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6198   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6199 
6200   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6201   // add the f16 bias (15) to get the biased exponent for the f16 format.
6202   E = MIRBuilder.buildAdd(
6203     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6204 
6205   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6206   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6207 
6208   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6209                                        MIRBuilder.buildConstant(S32, 0x1ff));
6210   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6211 
6212   auto Zero = MIRBuilder.buildConstant(S32, 0);
6213   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6214   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6215   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6216 
6217   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6218   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6219   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6220   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6221 
6222   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6223   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6224 
6225   // N = M | (E << 12);
6226   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6227   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6228 
6229   // B = clamp(1-E, 0, 13);
6230   auto One = MIRBuilder.buildConstant(S32, 1);
6231   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6232   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6233   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6234 
6235   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6236                                        MIRBuilder.buildConstant(S32, 0x1000));
6237 
6238   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6239   auto D0 = MIRBuilder.buildShl(S32, D, B);
6240 
6241   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6242                                              D0, SigSetHigh);
6243   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6244   D = MIRBuilder.buildOr(S32, D, D1);
6245 
6246   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6247   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6248 
6249   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6250   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6251 
6252   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6253                                        MIRBuilder.buildConstant(S32, 3));
6254   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6255 
6256   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6257                                        MIRBuilder.buildConstant(S32, 5));
6258   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6259 
6260   V1 = MIRBuilder.buildOr(S32, V0, V1);
6261   V = MIRBuilder.buildAdd(S32, V, V1);
6262 
6263   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6264                                        E, MIRBuilder.buildConstant(S32, 30));
6265   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6266                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6267 
6268   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6269                                          E, MIRBuilder.buildConstant(S32, 1039));
6270   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6271 
6272   // Extract the sign bit.
6273   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6274   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6275 
6276   // Insert the sign bit
6277   V = MIRBuilder.buildOr(S32, Sign, V);
6278 
6279   MIRBuilder.buildTrunc(Dst, V);
6280   MI.eraseFromParent();
6281   return Legalized;
6282 }
6283 
6284 LegalizerHelper::LegalizeResult
6285 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6286   Register Dst = MI.getOperand(0).getReg();
6287   Register Src = MI.getOperand(1).getReg();
6288 
6289   LLT DstTy = MRI.getType(Dst);
6290   LLT SrcTy = MRI.getType(Src);
6291   const LLT S64 = LLT::scalar(64);
6292   const LLT S16 = LLT::scalar(16);
6293 
6294   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6295     return lowerFPTRUNC_F64_TO_F16(MI);
6296 
6297   return UnableToLegalize;
6298 }
6299 
6300 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6301 // multiplication tree.
6302 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6303   Register Dst = MI.getOperand(0).getReg();
6304   Register Src0 = MI.getOperand(1).getReg();
6305   Register Src1 = MI.getOperand(2).getReg();
6306   LLT Ty = MRI.getType(Dst);
6307 
6308   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6309   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6310   MI.eraseFromParent();
6311   return Legalized;
6312 }
6313 
6314 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6315   switch (Opc) {
6316   case TargetOpcode::G_SMIN:
6317     return CmpInst::ICMP_SLT;
6318   case TargetOpcode::G_SMAX:
6319     return CmpInst::ICMP_SGT;
6320   case TargetOpcode::G_UMIN:
6321     return CmpInst::ICMP_ULT;
6322   case TargetOpcode::G_UMAX:
6323     return CmpInst::ICMP_UGT;
6324   default:
6325     llvm_unreachable("not in integer min/max");
6326   }
6327 }
6328 
6329 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6330   Register Dst = MI.getOperand(0).getReg();
6331   Register Src0 = MI.getOperand(1).getReg();
6332   Register Src1 = MI.getOperand(2).getReg();
6333 
6334   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6335   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6336 
6337   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6338   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6339 
6340   MI.eraseFromParent();
6341   return Legalized;
6342 }
6343 
6344 LegalizerHelper::LegalizeResult
6345 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6346   Register Dst = MI.getOperand(0).getReg();
6347   Register Src0 = MI.getOperand(1).getReg();
6348   Register Src1 = MI.getOperand(2).getReg();
6349 
6350   const LLT Src0Ty = MRI.getType(Src0);
6351   const LLT Src1Ty = MRI.getType(Src1);
6352 
6353   const int Src0Size = Src0Ty.getScalarSizeInBits();
6354   const int Src1Size = Src1Ty.getScalarSizeInBits();
6355 
6356   auto SignBitMask = MIRBuilder.buildConstant(
6357     Src0Ty, APInt::getSignMask(Src0Size));
6358 
6359   auto NotSignBitMask = MIRBuilder.buildConstant(
6360     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6361 
6362   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6363   Register And1;
6364   if (Src0Ty == Src1Ty) {
6365     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6366   } else if (Src0Size > Src1Size) {
6367     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6368     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6369     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6370     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6371   } else {
6372     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6373     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6374     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6375     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6376   }
6377 
6378   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6379   // constants are a nan and -0.0, but the final result should preserve
6380   // everything.
6381   unsigned Flags = MI.getFlags();
6382   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6383 
6384   MI.eraseFromParent();
6385   return Legalized;
6386 }
6387 
6388 LegalizerHelper::LegalizeResult
6389 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6390   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6391     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6392 
6393   Register Dst = MI.getOperand(0).getReg();
6394   Register Src0 = MI.getOperand(1).getReg();
6395   Register Src1 = MI.getOperand(2).getReg();
6396   LLT Ty = MRI.getType(Dst);
6397 
6398   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6399     // Insert canonicalizes if it's possible we need to quiet to get correct
6400     // sNaN behavior.
6401 
6402     // Note this must be done here, and not as an optimization combine in the
6403     // absence of a dedicate quiet-snan instruction as we're using an
6404     // omni-purpose G_FCANONICALIZE.
6405     if (!isKnownNeverSNaN(Src0, MRI))
6406       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6407 
6408     if (!isKnownNeverSNaN(Src1, MRI))
6409       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6410   }
6411 
6412   // If there are no nans, it's safe to simply replace this with the non-IEEE
6413   // version.
6414   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6415   MI.eraseFromParent();
6416   return Legalized;
6417 }
6418 
6419 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6420   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6421   Register DstReg = MI.getOperand(0).getReg();
6422   LLT Ty = MRI.getType(DstReg);
6423   unsigned Flags = MI.getFlags();
6424 
6425   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6426                                   Flags);
6427   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6428   MI.eraseFromParent();
6429   return Legalized;
6430 }
6431 
6432 LegalizerHelper::LegalizeResult
6433 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6434   Register DstReg = MI.getOperand(0).getReg();
6435   Register X = MI.getOperand(1).getReg();
6436   const unsigned Flags = MI.getFlags();
6437   const LLT Ty = MRI.getType(DstReg);
6438   const LLT CondTy = Ty.changeElementSize(1);
6439 
6440   // round(x) =>
6441   //  t = trunc(x);
6442   //  d = fabs(x - t);
6443   //  o = copysign(1.0f, x);
6444   //  return t + (d >= 0.5 ? o : 0.0);
6445 
6446   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6447 
6448   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6449   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6450   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6451   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6452   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6453   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6454 
6455   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6456                                   Flags);
6457   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6458 
6459   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6460 
6461   MI.eraseFromParent();
6462   return Legalized;
6463 }
6464 
6465 LegalizerHelper::LegalizeResult
6466 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6467   Register DstReg = MI.getOperand(0).getReg();
6468   Register SrcReg = MI.getOperand(1).getReg();
6469   unsigned Flags = MI.getFlags();
6470   LLT Ty = MRI.getType(DstReg);
6471   const LLT CondTy = Ty.changeElementSize(1);
6472 
6473   // result = trunc(src);
6474   // if (src < 0.0 && src != result)
6475   //   result += -1.0.
6476 
6477   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6478   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6479 
6480   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6481                                   SrcReg, Zero, Flags);
6482   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6483                                       SrcReg, Trunc, Flags);
6484   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6485   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6486 
6487   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6488   MI.eraseFromParent();
6489   return Legalized;
6490 }
6491 
6492 LegalizerHelper::LegalizeResult
6493 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6494   const unsigned NumOps = MI.getNumOperands();
6495   Register DstReg = MI.getOperand(0).getReg();
6496   Register Src0Reg = MI.getOperand(1).getReg();
6497   LLT DstTy = MRI.getType(DstReg);
6498   LLT SrcTy = MRI.getType(Src0Reg);
6499   unsigned PartSize = SrcTy.getSizeInBits();
6500 
6501   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6502   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6503 
6504   for (unsigned I = 2; I != NumOps; ++I) {
6505     const unsigned Offset = (I - 1) * PartSize;
6506 
6507     Register SrcReg = MI.getOperand(I).getReg();
6508     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6509 
6510     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6511       MRI.createGenericVirtualRegister(WideTy);
6512 
6513     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6514     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6515     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6516     ResultReg = NextResult;
6517   }
6518 
6519   if (DstTy.isPointer()) {
6520     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6521           DstTy.getAddressSpace())) {
6522       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6523       return UnableToLegalize;
6524     }
6525 
6526     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6527   }
6528 
6529   MI.eraseFromParent();
6530   return Legalized;
6531 }
6532 
6533 LegalizerHelper::LegalizeResult
6534 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6535   const unsigned NumDst = MI.getNumOperands() - 1;
6536   Register SrcReg = MI.getOperand(NumDst).getReg();
6537   Register Dst0Reg = MI.getOperand(0).getReg();
6538   LLT DstTy = MRI.getType(Dst0Reg);
6539   if (DstTy.isPointer())
6540     return UnableToLegalize; // TODO
6541 
6542   SrcReg = coerceToScalar(SrcReg);
6543   if (!SrcReg)
6544     return UnableToLegalize;
6545 
6546   // Expand scalarizing unmerge as bitcast to integer and shift.
6547   LLT IntTy = MRI.getType(SrcReg);
6548 
6549   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6550 
6551   const unsigned DstSize = DstTy.getSizeInBits();
6552   unsigned Offset = DstSize;
6553   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6554     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6555     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6556     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6557   }
6558 
6559   MI.eraseFromParent();
6560   return Legalized;
6561 }
6562 
6563 /// Lower a vector extract or insert by writing the vector to a stack temporary
6564 /// and reloading the element or vector.
6565 ///
6566 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6567 ///  =>
6568 ///  %stack_temp = G_FRAME_INDEX
6569 ///  G_STORE %vec, %stack_temp
6570 ///  %idx = clamp(%idx, %vec.getNumElements())
6571 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6572 ///  %dst = G_LOAD %element_ptr
6573 LegalizerHelper::LegalizeResult
6574 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6575   Register DstReg = MI.getOperand(0).getReg();
6576   Register SrcVec = MI.getOperand(1).getReg();
6577   Register InsertVal;
6578   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6579     InsertVal = MI.getOperand(2).getReg();
6580 
6581   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6582 
6583   LLT VecTy = MRI.getType(SrcVec);
6584   LLT EltTy = VecTy.getElementType();
6585   unsigned NumElts = VecTy.getNumElements();
6586 
6587   int64_t IdxVal;
6588   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
6589     SmallVector<Register, 8> SrcRegs;
6590     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6591 
6592     if (InsertVal) {
6593       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6594       MIRBuilder.buildMerge(DstReg, SrcRegs);
6595     } else {
6596       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6597     }
6598 
6599     MI.eraseFromParent();
6600     return Legalized;
6601   }
6602 
6603   if (!EltTy.isByteSized()) { // Not implemented.
6604     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6605     return UnableToLegalize;
6606   }
6607 
6608   unsigned EltBytes = EltTy.getSizeInBytes();
6609   Align VecAlign = getStackTemporaryAlignment(VecTy);
6610   Align EltAlign;
6611 
6612   MachinePointerInfo PtrInfo;
6613   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6614                                         VecAlign, PtrInfo);
6615   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6616 
6617   // Get the pointer to the element, and be sure not to hit undefined behavior
6618   // if the index is out of bounds.
6619   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6620 
6621   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6622     int64_t Offset = IdxVal * EltBytes;
6623     PtrInfo = PtrInfo.getWithOffset(Offset);
6624     EltAlign = commonAlignment(VecAlign, Offset);
6625   } else {
6626     // We lose information with a variable offset.
6627     EltAlign = getStackTemporaryAlignment(EltTy);
6628     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6629   }
6630 
6631   if (InsertVal) {
6632     // Write the inserted element
6633     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6634 
6635     // Reload the whole vector.
6636     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6637   } else {
6638     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6639   }
6640 
6641   MI.eraseFromParent();
6642   return Legalized;
6643 }
6644 
6645 LegalizerHelper::LegalizeResult
6646 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6647   Register DstReg = MI.getOperand(0).getReg();
6648   Register Src0Reg = MI.getOperand(1).getReg();
6649   Register Src1Reg = MI.getOperand(2).getReg();
6650   LLT Src0Ty = MRI.getType(Src0Reg);
6651   LLT DstTy = MRI.getType(DstReg);
6652   LLT IdxTy = LLT::scalar(32);
6653 
6654   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6655 
6656   if (DstTy.isScalar()) {
6657     if (Src0Ty.isVector())
6658       return UnableToLegalize;
6659 
6660     // This is just a SELECT.
6661     assert(Mask.size() == 1 && "Expected a single mask element");
6662     Register Val;
6663     if (Mask[0] < 0 || Mask[0] > 1)
6664       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6665     else
6666       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6667     MIRBuilder.buildCopy(DstReg, Val);
6668     MI.eraseFromParent();
6669     return Legalized;
6670   }
6671 
6672   Register Undef;
6673   SmallVector<Register, 32> BuildVec;
6674   LLT EltTy = DstTy.getElementType();
6675 
6676   for (int Idx : Mask) {
6677     if (Idx < 0) {
6678       if (!Undef.isValid())
6679         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6680       BuildVec.push_back(Undef);
6681       continue;
6682     }
6683 
6684     if (Src0Ty.isScalar()) {
6685       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6686     } else {
6687       int NumElts = Src0Ty.getNumElements();
6688       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6689       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6690       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6691       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6692       BuildVec.push_back(Extract.getReg(0));
6693     }
6694   }
6695 
6696   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6697   MI.eraseFromParent();
6698   return Legalized;
6699 }
6700 
6701 LegalizerHelper::LegalizeResult
6702 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6703   const auto &MF = *MI.getMF();
6704   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6705   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6706     return UnableToLegalize;
6707 
6708   Register Dst = MI.getOperand(0).getReg();
6709   Register AllocSize = MI.getOperand(1).getReg();
6710   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6711 
6712   LLT PtrTy = MRI.getType(Dst);
6713   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6714 
6715   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6716   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6717   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6718 
6719   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6720   // have to generate an extra instruction to negate the alloc and then use
6721   // G_PTR_ADD to add the negative offset.
6722   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6723   if (Alignment > Align(1)) {
6724     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6725     AlignMask.negate();
6726     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6727     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6728   }
6729 
6730   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6731   MIRBuilder.buildCopy(SPReg, SPTmp);
6732   MIRBuilder.buildCopy(Dst, SPTmp);
6733 
6734   MI.eraseFromParent();
6735   return Legalized;
6736 }
6737 
6738 LegalizerHelper::LegalizeResult
6739 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6740   Register Dst = MI.getOperand(0).getReg();
6741   Register Src = MI.getOperand(1).getReg();
6742   unsigned Offset = MI.getOperand(2).getImm();
6743 
6744   LLT DstTy = MRI.getType(Dst);
6745   LLT SrcTy = MRI.getType(Src);
6746 
6747   // Extract sub-vector or one element
6748   if (SrcTy.isVector()) {
6749     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6750     unsigned DstSize = DstTy.getSizeInBits();
6751 
6752     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6753         (Offset + DstSize <= SrcTy.getSizeInBits())) {
6754       // Unmerge and allow access to each Src element for the artifact combiner.
6755       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);
6756 
6757       // Take element(s) we need to extract and copy it (merge them).
6758       SmallVector<Register, 8> SubVectorElts;
6759       for (unsigned Idx = Offset / SrcEltSize;
6760            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6761         SubVectorElts.push_back(Unmerge.getReg(Idx));
6762       }
6763       if (SubVectorElts.size() == 1)
6764         MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
6765       else
6766         MIRBuilder.buildMerge(Dst, SubVectorElts);
6767 
6768       MI.eraseFromParent();
6769       return Legalized;
6770     }
6771   }
6772 
6773   if (DstTy.isScalar() &&
6774       (SrcTy.isScalar() ||
6775        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6776     LLT SrcIntTy = SrcTy;
6777     if (!SrcTy.isScalar()) {
6778       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6779       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6780     }
6781 
6782     if (Offset == 0)
6783       MIRBuilder.buildTrunc(Dst, Src);
6784     else {
6785       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6786       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6787       MIRBuilder.buildTrunc(Dst, Shr);
6788     }
6789 
6790     MI.eraseFromParent();
6791     return Legalized;
6792   }
6793 
6794   return UnableToLegalize;
6795 }
6796 
6797 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6798   Register Dst = MI.getOperand(0).getReg();
6799   Register Src = MI.getOperand(1).getReg();
6800   Register InsertSrc = MI.getOperand(2).getReg();
6801   uint64_t Offset = MI.getOperand(3).getImm();
6802 
6803   LLT DstTy = MRI.getType(Src);
6804   LLT InsertTy = MRI.getType(InsertSrc);
6805 
6806   // Insert sub-vector or one element
6807   if (DstTy.isVector() && !InsertTy.isPointer()) {
6808     LLT EltTy = DstTy.getElementType();
6809     unsigned EltSize = EltTy.getSizeInBits();
6810     unsigned InsertSize = InsertTy.getSizeInBits();
6811 
6812     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6813         (Offset + InsertSize <= DstTy.getSizeInBits())) {
6814       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6815       SmallVector<Register, 8> DstElts;
6816       unsigned Idx = 0;
6817       // Elements from Src before insert start Offset
6818       for (; Idx < Offset / EltSize; ++Idx) {
6819         DstElts.push_back(UnmergeSrc.getReg(Idx));
6820       }
6821 
6822       // Replace elements in Src with elements from InsertSrc
6823       if (InsertTy.getSizeInBits() > EltSize) {
6824         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6825         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6826              ++Idx, ++i) {
6827           DstElts.push_back(UnmergeInsertSrc.getReg(i));
6828         }
6829       } else {
6830         DstElts.push_back(InsertSrc);
6831         ++Idx;
6832       }
6833 
6834       // Remaining elements from Src after insert
6835       for (; Idx < DstTy.getNumElements(); ++Idx) {
6836         DstElts.push_back(UnmergeSrc.getReg(Idx));
6837       }
6838 
6839       MIRBuilder.buildMerge(Dst, DstElts);
6840       MI.eraseFromParent();
6841       return Legalized;
6842     }
6843   }
6844 
6845   if (InsertTy.isVector() ||
6846       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6847     return UnableToLegalize;
6848 
6849   const DataLayout &DL = MIRBuilder.getDataLayout();
6850   if ((DstTy.isPointer() &&
6851        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6852       (InsertTy.isPointer() &&
6853        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6854     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6855     return UnableToLegalize;
6856   }
6857 
6858   LLT IntDstTy = DstTy;
6859 
6860   if (!DstTy.isScalar()) {
6861     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6862     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6863   }
6864 
6865   if (!InsertTy.isScalar()) {
6866     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6867     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6868   }
6869 
6870   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6871   if (Offset != 0) {
6872     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6873     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6874   }
6875 
6876   APInt MaskVal = APInt::getBitsSetWithWrap(
6877       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6878 
6879   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6880   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6881   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6882 
6883   MIRBuilder.buildCast(Dst, Or);
6884   MI.eraseFromParent();
6885   return Legalized;
6886 }
6887 
6888 LegalizerHelper::LegalizeResult
6889 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6890   Register Dst0 = MI.getOperand(0).getReg();
6891   Register Dst1 = MI.getOperand(1).getReg();
6892   Register LHS = MI.getOperand(2).getReg();
6893   Register RHS = MI.getOperand(3).getReg();
6894   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6895 
6896   LLT Ty = MRI.getType(Dst0);
6897   LLT BoolTy = MRI.getType(Dst1);
6898 
6899   if (IsAdd)
6900     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6901   else
6902     MIRBuilder.buildSub(Dst0, LHS, RHS);
6903 
6904   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6905 
6906   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6907 
6908   // For an addition, the result should be less than one of the operands (LHS)
6909   // if and only if the other operand (RHS) is negative, otherwise there will
6910   // be overflow.
6911   // For a subtraction, the result should be less than one of the operands
6912   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6913   // otherwise there will be overflow.
6914   auto ResultLowerThanLHS =
6915       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6916   auto ConditionRHS = MIRBuilder.buildICmp(
6917       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6918 
6919   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6920   MI.eraseFromParent();
6921   return Legalized;
6922 }
6923 
6924 LegalizerHelper::LegalizeResult
6925 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6926   Register Res = MI.getOperand(0).getReg();
6927   Register LHS = MI.getOperand(1).getReg();
6928   Register RHS = MI.getOperand(2).getReg();
6929   LLT Ty = MRI.getType(Res);
6930   bool IsSigned;
6931   bool IsAdd;
6932   unsigned BaseOp;
6933   switch (MI.getOpcode()) {
6934   default:
6935     llvm_unreachable("unexpected addsat/subsat opcode");
6936   case TargetOpcode::G_UADDSAT:
6937     IsSigned = false;
6938     IsAdd = true;
6939     BaseOp = TargetOpcode::G_ADD;
6940     break;
6941   case TargetOpcode::G_SADDSAT:
6942     IsSigned = true;
6943     IsAdd = true;
6944     BaseOp = TargetOpcode::G_ADD;
6945     break;
6946   case TargetOpcode::G_USUBSAT:
6947     IsSigned = false;
6948     IsAdd = false;
6949     BaseOp = TargetOpcode::G_SUB;
6950     break;
6951   case TargetOpcode::G_SSUBSAT:
6952     IsSigned = true;
6953     IsAdd = false;
6954     BaseOp = TargetOpcode::G_SUB;
6955     break;
6956   }
6957 
6958   if (IsSigned) {
6959     // sadd.sat(a, b) ->
6960     //   hi = 0x7fffffff - smax(a, 0)
6961     //   lo = 0x80000000 - smin(a, 0)
6962     //   a + smin(smax(lo, b), hi)
6963     // ssub.sat(a, b) ->
6964     //   lo = smax(a, -1) - 0x7fffffff
6965     //   hi = smin(a, -1) - 0x80000000
6966     //   a - smin(smax(lo, b), hi)
6967     // TODO: AMDGPU can use a "median of 3" instruction here:
6968     //   a +/- med3(lo, b, hi)
6969     uint64_t NumBits = Ty.getScalarSizeInBits();
6970     auto MaxVal =
6971         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
6972     auto MinVal =
6973         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
6974     MachineInstrBuilder Hi, Lo;
6975     if (IsAdd) {
6976       auto Zero = MIRBuilder.buildConstant(Ty, 0);
6977       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
6978       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
6979     } else {
6980       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
6981       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
6982                                MaxVal);
6983       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
6984                                MinVal);
6985     }
6986     auto RHSClamped =
6987         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
6988     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
6989   } else {
6990     // uadd.sat(a, b) -> a + umin(~a, b)
6991     // usub.sat(a, b) -> a - umin(a, b)
6992     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
6993     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
6994     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
6995   }
6996 
6997   MI.eraseFromParent();
6998   return Legalized;
6999 }
7000 
7001 LegalizerHelper::LegalizeResult
7002 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7003   Register Res = MI.getOperand(0).getReg();
7004   Register LHS = MI.getOperand(1).getReg();
7005   Register RHS = MI.getOperand(2).getReg();
7006   LLT Ty = MRI.getType(Res);
7007   LLT BoolTy = Ty.changeElementSize(1);
7008   bool IsSigned;
7009   bool IsAdd;
7010   unsigned OverflowOp;
7011   switch (MI.getOpcode()) {
7012   default:
7013     llvm_unreachable("unexpected addsat/subsat opcode");
7014   case TargetOpcode::G_UADDSAT:
7015     IsSigned = false;
7016     IsAdd = true;
7017     OverflowOp = TargetOpcode::G_UADDO;
7018     break;
7019   case TargetOpcode::G_SADDSAT:
7020     IsSigned = true;
7021     IsAdd = true;
7022     OverflowOp = TargetOpcode::G_SADDO;
7023     break;
7024   case TargetOpcode::G_USUBSAT:
7025     IsSigned = false;
7026     IsAdd = false;
7027     OverflowOp = TargetOpcode::G_USUBO;
7028     break;
7029   case TargetOpcode::G_SSUBSAT:
7030     IsSigned = true;
7031     IsAdd = false;
7032     OverflowOp = TargetOpcode::G_SSUBO;
7033     break;
7034   }
7035 
7036   auto OverflowRes =
7037       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7038   Register Tmp = OverflowRes.getReg(0);
7039   Register Ov = OverflowRes.getReg(1);
7040   MachineInstrBuilder Clamp;
7041   if (IsSigned) {
7042     // sadd.sat(a, b) ->
7043     //   {tmp, ov} = saddo(a, b)
7044     //   ov ? (tmp >>s 31) + 0x80000000 : r
7045     // ssub.sat(a, b) ->
7046     //   {tmp, ov} = ssubo(a, b)
7047     //   ov ? (tmp >>s 31) + 0x80000000 : r
7048     uint64_t NumBits = Ty.getScalarSizeInBits();
7049     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7050     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7051     auto MinVal =
7052         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7053     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7054   } else {
7055     // uadd.sat(a, b) ->
7056     //   {tmp, ov} = uaddo(a, b)
7057     //   ov ? 0xffffffff : tmp
7058     // usub.sat(a, b) ->
7059     //   {tmp, ov} = usubo(a, b)
7060     //   ov ? 0 : tmp
7061     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7062   }
7063   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7064 
7065   MI.eraseFromParent();
7066   return Legalized;
7067 }
7068 
7069 LegalizerHelper::LegalizeResult
7070 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7071   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7072           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7073          "Expected shlsat opcode!");
7074   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7075   Register Res = MI.getOperand(0).getReg();
7076   Register LHS = MI.getOperand(1).getReg();
7077   Register RHS = MI.getOperand(2).getReg();
7078   LLT Ty = MRI.getType(Res);
7079   LLT BoolTy = Ty.changeElementSize(1);
7080 
7081   unsigned BW = Ty.getScalarSizeInBits();
7082   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7083   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7084                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7085 
7086   MachineInstrBuilder SatVal;
7087   if (IsSigned) {
7088     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7089     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7090     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7091                                     MIRBuilder.buildConstant(Ty, 0));
7092     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7093   } else {
7094     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7095   }
7096   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7097   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7098 
7099   MI.eraseFromParent();
7100   return Legalized;
7101 }
7102 
7103 LegalizerHelper::LegalizeResult
7104 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7105   Register Dst = MI.getOperand(0).getReg();
7106   Register Src = MI.getOperand(1).getReg();
7107   const LLT Ty = MRI.getType(Src);
7108   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7109   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7110 
7111   // Swap most and least significant byte, set remaining bytes in Res to zero.
7112   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7113   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7114   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7115   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7116 
7117   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7118   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7119     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7120     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7121     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7122     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7123     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7124     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7125     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7126     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7127     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7128     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7129     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7130     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7131   }
7132   Res.getInstr()->getOperand(0).setReg(Dst);
7133 
7134   MI.eraseFromParent();
7135   return Legalized;
7136 }
7137 
7138 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7139 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7140                                  MachineInstrBuilder Src, APInt Mask) {
7141   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7142   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7143   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7144   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7145   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7146   return B.buildOr(Dst, LHS, RHS);
7147 }
7148 
7149 LegalizerHelper::LegalizeResult
7150 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7151   Register Dst = MI.getOperand(0).getReg();
7152   Register Src = MI.getOperand(1).getReg();
7153   const LLT Ty = MRI.getType(Src);
7154   unsigned Size = Ty.getSizeInBits();
7155 
7156   MachineInstrBuilder BSWAP =
7157       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7158 
7159   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7160   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7161   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7162   MachineInstrBuilder Swap4 =
7163       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7164 
7165   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7166   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7167   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7168   MachineInstrBuilder Swap2 =
7169       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7170 
7171   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7172   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7173   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7174   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7175 
7176   MI.eraseFromParent();
7177   return Legalized;
7178 }
7179 
7180 LegalizerHelper::LegalizeResult
7181 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7182   MachineFunction &MF = MIRBuilder.getMF();
7183 
7184   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7185   int NameOpIdx = IsRead ? 1 : 0;
7186   int ValRegIndex = IsRead ? 0 : 1;
7187 
7188   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7189   const LLT Ty = MRI.getType(ValReg);
7190   const MDString *RegStr = cast<MDString>(
7191     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7192 
7193   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7194   if (!PhysReg.isValid())
7195     return UnableToLegalize;
7196 
7197   if (IsRead)
7198     MIRBuilder.buildCopy(ValReg, PhysReg);
7199   else
7200     MIRBuilder.buildCopy(PhysReg, ValReg);
7201 
7202   MI.eraseFromParent();
7203   return Legalized;
7204 }
7205 
7206 LegalizerHelper::LegalizeResult
7207 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7208   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7209   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7210   Register Result = MI.getOperand(0).getReg();
7211   LLT OrigTy = MRI.getType(Result);
7212   auto SizeInBits = OrigTy.getScalarSizeInBits();
7213   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7214 
7215   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7216   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7217   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7218   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7219 
7220   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7221   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7222   MIRBuilder.buildTrunc(Result, Shifted);
7223 
7224   MI.eraseFromParent();
7225   return Legalized;
7226 }
7227 
7228 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7229   // Implement vector G_SELECT in terms of XOR, AND, OR.
7230   Register DstReg = MI.getOperand(0).getReg();
7231   Register MaskReg = MI.getOperand(1).getReg();
7232   Register Op1Reg = MI.getOperand(2).getReg();
7233   Register Op2Reg = MI.getOperand(3).getReg();
7234   LLT DstTy = MRI.getType(DstReg);
7235   LLT MaskTy = MRI.getType(MaskReg);
7236   LLT Op1Ty = MRI.getType(Op1Reg);
7237   if (!DstTy.isVector())
7238     return UnableToLegalize;
7239 
7240   // Vector selects can have a scalar predicate. If so, splat into a vector and
7241   // finish for later legalization attempts to try again.
7242   if (MaskTy.isScalar()) {
7243     Register MaskElt = MaskReg;
7244     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7245       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7246     // Generate a vector splat idiom to be pattern matched later.
7247     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7248     Observer.changingInstr(MI);
7249     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7250     Observer.changedInstr(MI);
7251     return Legalized;
7252   }
7253 
7254   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7255     return UnableToLegalize;
7256   }
7257 
7258   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7259   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7260   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7261   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7262   MI.eraseFromParent();
7263   return Legalized;
7264 }
7265 
7266 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7267   // Split DIVREM into individual instructions.
7268   unsigned Opcode = MI.getOpcode();
7269 
7270   MIRBuilder.buildInstr(
7271       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7272                                         : TargetOpcode::G_UDIV,
7273       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7274   MIRBuilder.buildInstr(
7275       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7276                                         : TargetOpcode::G_UREM,
7277       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7278   MI.eraseFromParent();
7279   return Legalized;
7280 }
7281 
7282 LegalizerHelper::LegalizeResult
7283 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7284   // Expand %res = G_ABS %a into:
7285   // %v1 = G_ASHR %a, scalar_size-1
7286   // %v2 = G_ADD %a, %v1
7287   // %res = G_XOR %v2, %v1
7288   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7289   Register OpReg = MI.getOperand(1).getReg();
7290   auto ShiftAmt =
7291       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7292   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7293   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7294   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7295   MI.eraseFromParent();
7296   return Legalized;
7297 }
7298 
7299 LegalizerHelper::LegalizeResult
7300 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7301   // Expand %res = G_ABS %a into:
7302   // %v1 = G_CONSTANT 0
7303   // %v2 = G_SUB %v1, %a
7304   // %res = G_SMAX %a, %v2
7305   Register SrcReg = MI.getOperand(1).getReg();
7306   LLT Ty = MRI.getType(SrcReg);
7307   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7308   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7309   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7310   MI.eraseFromParent();
7311   return Legalized;
7312 }
7313 
7314 LegalizerHelper::LegalizeResult
7315 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7316   Register SrcReg = MI.getOperand(1).getReg();
7317   LLT SrcTy = MRI.getType(SrcReg);
7318   LLT DstTy = MRI.getType(SrcReg);
7319 
7320   // The source could be a scalar if the IR type was <1 x sN>.
7321   if (SrcTy.isScalar()) {
7322     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7323       return UnableToLegalize; // FIXME: handle extension.
7324     // This can be just a plain copy.
7325     Observer.changingInstr(MI);
7326     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7327     Observer.changedInstr(MI);
7328     return Legalized;
7329   }
7330   return UnableToLegalize;;
7331 }
7332 
7333 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7334   // On Darwin, -Os means optimize for size without hurting performance, so
7335   // only really optimize for size when -Oz (MinSize) is used.
7336   if (MF.getTarget().getTargetTriple().isOSDarwin())
7337     return MF.getFunction().hasMinSize();
7338   return MF.getFunction().hasOptSize();
7339 }
7340 
7341 // Returns a list of types to use for memory op lowering in MemOps. A partial
7342 // port of findOptimalMemOpLowering in TargetLowering.
7343 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7344                                           unsigned Limit, const MemOp &Op,
7345                                           unsigned DstAS, unsigned SrcAS,
7346                                           const AttributeList &FuncAttributes,
7347                                           const TargetLowering &TLI) {
7348   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7349     return false;
7350 
7351   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7352 
7353   if (Ty == LLT()) {
7354     // Use the largest scalar type whose alignment constraints are satisfied.
7355     // We only need to check DstAlign here as SrcAlign is always greater or
7356     // equal to DstAlign (or zero).
7357     Ty = LLT::scalar(64);
7358     if (Op.isFixedDstAlign())
7359       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7360              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7361         Ty = LLT::scalar(Ty.getSizeInBytes());
7362     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7363     // FIXME: check for the largest legal type we can load/store to.
7364   }
7365 
7366   unsigned NumMemOps = 0;
7367   uint64_t Size = Op.size();
7368   while (Size) {
7369     unsigned TySize = Ty.getSizeInBytes();
7370     while (TySize > Size) {
7371       // For now, only use non-vector load / store's for the left-over pieces.
7372       LLT NewTy = Ty;
7373       // FIXME: check for mem op safety and legality of the types. Not all of
7374       // SDAGisms map cleanly to GISel concepts.
7375       if (NewTy.isVector())
7376         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7377       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7378       unsigned NewTySize = NewTy.getSizeInBytes();
7379       assert(NewTySize > 0 && "Could not find appropriate type");
7380 
7381       // If the new LLT cannot cover all of the remaining bits, then consider
7382       // issuing a (or a pair of) unaligned and overlapping load / store.
7383       bool Fast;
7384       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7385       MVT VT = getMVTForLLT(Ty);
7386       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7387           TLI.allowsMisalignedMemoryAccesses(
7388               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7389               MachineMemOperand::MONone, &Fast) &&
7390           Fast)
7391         TySize = Size;
7392       else {
7393         Ty = NewTy;
7394         TySize = NewTySize;
7395       }
7396     }
7397 
7398     if (++NumMemOps > Limit)
7399       return false;
7400 
7401     MemOps.push_back(Ty);
7402     Size -= TySize;
7403   }
7404 
7405   return true;
7406 }
7407 
7408 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7409   if (Ty.isVector())
7410     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7411                                 Ty.getNumElements());
7412   return IntegerType::get(C, Ty.getSizeInBits());
7413 }
7414 
7415 // Get a vectorized representation of the memset value operand, GISel edition.
7416 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7417   MachineRegisterInfo &MRI = *MIB.getMRI();
7418   unsigned NumBits = Ty.getScalarSizeInBits();
7419   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7420   if (!Ty.isVector() && ValVRegAndVal) {
7421     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7422     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7423     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7424   }
7425 
7426   // Extend the byte value to the larger type, and then multiply by a magic
7427   // value 0x010101... in order to replicate it across every byte.
7428   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7429   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7430     return MIB.buildConstant(Ty, 0).getReg(0);
7431   }
7432 
7433   LLT ExtType = Ty.getScalarType();
7434   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7435   if (NumBits > 8) {
7436     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7437     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7438     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7439   }
7440 
7441   // For vector types create a G_BUILD_VECTOR.
7442   if (Ty.isVector())
7443     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7444 
7445   return Val;
7446 }
7447 
7448 LegalizerHelper::LegalizeResult
7449 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7450                              uint64_t KnownLen, Align Alignment,
7451                              bool IsVolatile) {
7452   auto &MF = *MI.getParent()->getParent();
7453   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7454   auto &DL = MF.getDataLayout();
7455   LLVMContext &C = MF.getFunction().getContext();
7456 
7457   assert(KnownLen != 0 && "Have a zero length memset length!");
7458 
7459   bool DstAlignCanChange = false;
7460   MachineFrameInfo &MFI = MF.getFrameInfo();
7461   bool OptSize = shouldLowerMemFuncForSize(MF);
7462 
7463   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7464   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7465     DstAlignCanChange = true;
7466 
7467   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7468   std::vector<LLT> MemOps;
7469 
7470   const auto &DstMMO = **MI.memoperands_begin();
7471   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7472 
7473   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7474   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7475 
7476   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7477                                      MemOp::Set(KnownLen, DstAlignCanChange,
7478                                                 Alignment,
7479                                                 /*IsZeroMemset=*/IsZeroVal,
7480                                                 /*IsVolatile=*/IsVolatile),
7481                                      DstPtrInfo.getAddrSpace(), ~0u,
7482                                      MF.getFunction().getAttributes(), TLI))
7483     return UnableToLegalize;
7484 
7485   if (DstAlignCanChange) {
7486     // Get an estimate of the type from the LLT.
7487     Type *IRTy = getTypeForLLT(MemOps[0], C);
7488     Align NewAlign = DL.getABITypeAlign(IRTy);
7489     if (NewAlign > Alignment) {
7490       Alignment = NewAlign;
7491       unsigned FI = FIDef->getOperand(1).getIndex();
7492       // Give the stack frame object a larger alignment if needed.
7493       if (MFI.getObjectAlign(FI) < Alignment)
7494         MFI.setObjectAlignment(FI, Alignment);
7495     }
7496   }
7497 
7498   MachineIRBuilder MIB(MI);
7499   // Find the largest store and generate the bit pattern for it.
7500   LLT LargestTy = MemOps[0];
7501   for (unsigned i = 1; i < MemOps.size(); i++)
7502     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7503       LargestTy = MemOps[i];
7504 
7505   // The memset stored value is always defined as an s8, so in order to make it
7506   // work with larger store types we need to repeat the bit pattern across the
7507   // wider type.
7508   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7509 
7510   if (!MemSetValue)
7511     return UnableToLegalize;
7512 
7513   // Generate the stores. For each store type in the list, we generate the
7514   // matching store of that type to the destination address.
7515   LLT PtrTy = MRI.getType(Dst);
7516   unsigned DstOff = 0;
7517   unsigned Size = KnownLen;
7518   for (unsigned I = 0; I < MemOps.size(); I++) {
7519     LLT Ty = MemOps[I];
7520     unsigned TySize = Ty.getSizeInBytes();
7521     if (TySize > Size) {
7522       // Issuing an unaligned load / store pair that overlaps with the previous
7523       // pair. Adjust the offset accordingly.
7524       assert(I == MemOps.size() - 1 && I != 0);
7525       DstOff -= TySize - Size;
7526     }
7527 
7528     // If this store is smaller than the largest store see whether we can get
7529     // the smaller value for free with a truncate.
7530     Register Value = MemSetValue;
7531     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7532       MVT VT = getMVTForLLT(Ty);
7533       MVT LargestVT = getMVTForLLT(LargestTy);
7534       if (!LargestTy.isVector() && !Ty.isVector() &&
7535           TLI.isTruncateFree(LargestVT, VT))
7536         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7537       else
7538         Value = getMemsetValue(Val, Ty, MIB);
7539       if (!Value)
7540         return UnableToLegalize;
7541     }
7542 
7543     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7544 
7545     Register Ptr = Dst;
7546     if (DstOff != 0) {
7547       auto Offset =
7548           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7549       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7550     }
7551 
7552     MIB.buildStore(Value, Ptr, *StoreMMO);
7553     DstOff += Ty.getSizeInBytes();
7554     Size -= TySize;
7555   }
7556 
7557   MI.eraseFromParent();
7558   return Legalized;
7559 }
7560 
7561 LegalizerHelper::LegalizeResult
7562 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7563   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7564 
7565   Register Dst = MI.getOperand(0).getReg();
7566   Register Src = MI.getOperand(1).getReg();
7567   Register Len = MI.getOperand(2).getReg();
7568 
7569   const auto *MMOIt = MI.memoperands_begin();
7570   const MachineMemOperand *MemOp = *MMOIt;
7571   bool IsVolatile = MemOp->isVolatile();
7572 
7573   // See if this is a constant length copy
7574   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7575   // FIXME: support dynamically sized G_MEMCPY_INLINE
7576   assert(LenVRegAndVal.hasValue() &&
7577          "inline memcpy with dynamic size is not yet supported");
7578   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7579   if (KnownLen == 0) {
7580     MI.eraseFromParent();
7581     return Legalized;
7582   }
7583 
7584   const auto &DstMMO = **MI.memoperands_begin();
7585   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7586   Align DstAlign = DstMMO.getBaseAlign();
7587   Align SrcAlign = SrcMMO.getBaseAlign();
7588 
7589   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7590                            IsVolatile);
7591 }
7592 
7593 LegalizerHelper::LegalizeResult
7594 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7595                                    uint64_t KnownLen, Align DstAlign,
7596                                    Align SrcAlign, bool IsVolatile) {
7597   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7598   return lowerMemcpy(MI, Dst, Src, KnownLen,
7599                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7600                      IsVolatile);
7601 }
7602 
7603 LegalizerHelper::LegalizeResult
7604 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7605                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7606                              Align SrcAlign, bool IsVolatile) {
7607   auto &MF = *MI.getParent()->getParent();
7608   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7609   auto &DL = MF.getDataLayout();
7610   LLVMContext &C = MF.getFunction().getContext();
7611 
7612   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7613 
7614   bool DstAlignCanChange = false;
7615   MachineFrameInfo &MFI = MF.getFrameInfo();
7616   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7617 
7618   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7619   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7620     DstAlignCanChange = true;
7621 
7622   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7623   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7624   // if the memcpy is in a tail call position.
7625 
7626   std::vector<LLT> MemOps;
7627 
7628   const auto &DstMMO = **MI.memoperands_begin();
7629   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7630   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7631   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7632 
7633   if (!findGISelOptimalMemOpLowering(
7634           MemOps, Limit,
7635           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7636                       IsVolatile),
7637           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7638           MF.getFunction().getAttributes(), TLI))
7639     return UnableToLegalize;
7640 
7641   if (DstAlignCanChange) {
7642     // Get an estimate of the type from the LLT.
7643     Type *IRTy = getTypeForLLT(MemOps[0], C);
7644     Align NewAlign = DL.getABITypeAlign(IRTy);
7645 
7646     // Don't promote to an alignment that would require dynamic stack
7647     // realignment.
7648     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7649     if (!TRI->hasStackRealignment(MF))
7650       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7651         NewAlign = NewAlign / 2;
7652 
7653     if (NewAlign > Alignment) {
7654       Alignment = NewAlign;
7655       unsigned FI = FIDef->getOperand(1).getIndex();
7656       // Give the stack frame object a larger alignment if needed.
7657       if (MFI.getObjectAlign(FI) < Alignment)
7658         MFI.setObjectAlignment(FI, Alignment);
7659     }
7660   }
7661 
7662   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7663 
7664   MachineIRBuilder MIB(MI);
7665   // Now we need to emit a pair of load and stores for each of the types we've
7666   // collected. I.e. for each type, generate a load from the source pointer of
7667   // that type width, and then generate a corresponding store to the dest buffer
7668   // of that value loaded. This can result in a sequence of loads and stores
7669   // mixed types, depending on what the target specifies as good types to use.
7670   unsigned CurrOffset = 0;
7671   unsigned Size = KnownLen;
7672   for (auto CopyTy : MemOps) {
7673     // Issuing an unaligned load / store pair  that overlaps with the previous
7674     // pair. Adjust the offset accordingly.
7675     if (CopyTy.getSizeInBytes() > Size)
7676       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7677 
7678     // Construct MMOs for the accesses.
7679     auto *LoadMMO =
7680         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7681     auto *StoreMMO =
7682         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7683 
7684     // Create the load.
7685     Register LoadPtr = Src;
7686     Register Offset;
7687     if (CurrOffset != 0) {
7688       LLT SrcTy = MRI.getType(Src);
7689       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
7690                    .getReg(0);
7691       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7692     }
7693     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7694 
7695     // Create the store.
7696     Register StorePtr = Dst;
7697     if (CurrOffset != 0) {
7698       LLT DstTy = MRI.getType(Dst);
7699       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7700     }
7701     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7702     CurrOffset += CopyTy.getSizeInBytes();
7703     Size -= CopyTy.getSizeInBytes();
7704   }
7705 
7706   MI.eraseFromParent();
7707   return Legalized;
7708 }
7709 
7710 LegalizerHelper::LegalizeResult
7711 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7712                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7713                               bool IsVolatile) {
7714   auto &MF = *MI.getParent()->getParent();
7715   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7716   auto &DL = MF.getDataLayout();
7717   LLVMContext &C = MF.getFunction().getContext();
7718 
7719   assert(KnownLen != 0 && "Have a zero length memmove length!");
7720 
7721   bool DstAlignCanChange = false;
7722   MachineFrameInfo &MFI = MF.getFrameInfo();
7723   bool OptSize = shouldLowerMemFuncForSize(MF);
7724   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7725 
7726   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7727   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7728     DstAlignCanChange = true;
7729 
7730   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7731   std::vector<LLT> MemOps;
7732 
7733   const auto &DstMMO = **MI.memoperands_begin();
7734   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7735   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7736   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7737 
7738   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7739   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
7740   // same thing here.
7741   if (!findGISelOptimalMemOpLowering(
7742           MemOps, Limit,
7743           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7744                       /*IsVolatile*/ true),
7745           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7746           MF.getFunction().getAttributes(), TLI))
7747     return UnableToLegalize;
7748 
7749   if (DstAlignCanChange) {
7750     // Get an estimate of the type from the LLT.
7751     Type *IRTy = getTypeForLLT(MemOps[0], C);
7752     Align NewAlign = DL.getABITypeAlign(IRTy);
7753 
7754     // Don't promote to an alignment that would require dynamic stack
7755     // realignment.
7756     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7757     if (!TRI->hasStackRealignment(MF))
7758       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7759         NewAlign = NewAlign / 2;
7760 
7761     if (NewAlign > Alignment) {
7762       Alignment = NewAlign;
7763       unsigned FI = FIDef->getOperand(1).getIndex();
7764       // Give the stack frame object a larger alignment if needed.
7765       if (MFI.getObjectAlign(FI) < Alignment)
7766         MFI.setObjectAlignment(FI, Alignment);
7767     }
7768   }
7769 
7770   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7771 
7772   MachineIRBuilder MIB(MI);
7773   // Memmove requires that we perform the loads first before issuing the stores.
7774   // Apart from that, this loop is pretty much doing the same thing as the
7775   // memcpy codegen function.
7776   unsigned CurrOffset = 0;
7777   SmallVector<Register, 16> LoadVals;
7778   for (auto CopyTy : MemOps) {
7779     // Construct MMO for the load.
7780     auto *LoadMMO =
7781         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7782 
7783     // Create the load.
7784     Register LoadPtr = Src;
7785     if (CurrOffset != 0) {
7786       LLT SrcTy = MRI.getType(Src);
7787       auto Offset =
7788           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
7789       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7790     }
7791     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7792     CurrOffset += CopyTy.getSizeInBytes();
7793   }
7794 
7795   CurrOffset = 0;
7796   for (unsigned I = 0; I < MemOps.size(); ++I) {
7797     LLT CopyTy = MemOps[I];
7798     // Now store the values loaded.
7799     auto *StoreMMO =
7800         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7801 
7802     Register StorePtr = Dst;
7803     if (CurrOffset != 0) {
7804       LLT DstTy = MRI.getType(Dst);
7805       auto Offset =
7806           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
7807       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7808     }
7809     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7810     CurrOffset += CopyTy.getSizeInBytes();
7811   }
7812   MI.eraseFromParent();
7813   return Legalized;
7814 }
7815 
7816 LegalizerHelper::LegalizeResult
7817 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7818   const unsigned Opc = MI.getOpcode();
7819   // This combine is fairly complex so it's not written with a separate
7820   // matcher function.
7821   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7822           Opc == TargetOpcode::G_MEMSET) &&
7823          "Expected memcpy like instruction");
7824 
7825   auto MMOIt = MI.memoperands_begin();
7826   const MachineMemOperand *MemOp = *MMOIt;
7827 
7828   Align DstAlign = MemOp->getBaseAlign();
7829   Align SrcAlign;
7830   Register Dst = MI.getOperand(0).getReg();
7831   Register Src = MI.getOperand(1).getReg();
7832   Register Len = MI.getOperand(2).getReg();
7833 
7834   if (Opc != TargetOpcode::G_MEMSET) {
7835     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7836     MemOp = *(++MMOIt);
7837     SrcAlign = MemOp->getBaseAlign();
7838   }
7839 
7840   // See if this is a constant length copy
7841   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7842   if (!LenVRegAndVal)
7843     return UnableToLegalize;
7844   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7845 
7846   if (KnownLen == 0) {
7847     MI.eraseFromParent();
7848     return Legalized;
7849   }
7850 
7851   bool IsVolatile = MemOp->isVolatile();
7852   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7853     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7854                              IsVolatile);
7855 
7856   // Don't try to optimize volatile.
7857   if (IsVolatile)
7858     return UnableToLegalize;
7859 
7860   if (MaxLen && KnownLen > MaxLen)
7861     return UnableToLegalize;
7862 
7863   if (Opc == TargetOpcode::G_MEMCPY) {
7864     auto &MF = *MI.getParent()->getParent();
7865     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7866     bool OptSize = shouldLowerMemFuncForSize(MF);
7867     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7868     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7869                        IsVolatile);
7870   }
7871   if (Opc == TargetOpcode::G_MEMMOVE)
7872     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7873   if (Opc == TargetOpcode::G_MEMSET)
7874     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7875   return UnableToLegalize;
7876 }
7877