//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;

/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
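///
/// For example, breaking an s88 \p OrigTy into s32 pieces gives {2, 1}, with
/// \p LeftoverTy set to s24: two full s32 parts plus one s24 leftover.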
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

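/// Return the IR floating-point type with the same bit width as scalar \p Ty
/// (half, float, double, x86_fp80, or fp128), or nullptr if there is none.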
static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
  : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
    TLI(*MF.getSubtarget().getTargetLowering()) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

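/// Split \p Reg into \p NumParts registers of type \p Ty with a single
/// G_UNMERGE_VALUES, appending the new registers to \p VRegs. The pieces must
/// exactly cover \p Reg's type.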
void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
                                   SmallVectorImpl<Register> &VRegs) {
  for (int i = 0; i < NumParts; ++i)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  // Perform irregular split. Leftover is last element of RegPieces.
  if (MainTy.isVector()) {
    SmallVector<Register, 8> RegPieces;
    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
      VRegs.push_back(RegPieces[i]);
    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
    LeftoverTy = MRI.getType(LeftoverRegs[0]);
    return true;
  }

  LeftoverTy = LLT::scalar(LeftoverSize);
  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}

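/// Split vector \p Reg into pieces of \p NumElts elements, appending them to
/// \p VRegs. If the element count does not divide evenly, the final piece is
/// a smaller leftover vector (or a single scalar element).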
void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
                                         SmallVectorImpl<Register> &VRegs) {
  LLT RegTy = MRI.getType(Reg);
  assert(RegTy.isVector() && "Expected a vector type");

  LLT EltTy = RegTy.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  unsigned RegNumElts = RegTy.getNumElements();
  unsigned LeftoverNumElts = RegNumElts % NumElts;
  unsigned NumNarrowTyPieces = RegNumElts / NumElts;

  // Perfect split without leftover.
  if (LeftoverNumElts == 0)
    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);

  // Irregular split. Unmerge to individual elements first, which gives the
  // artifact combiner direct access to them, then build vectors with NumElts
  // elements each. The remaining element(s) form the leftover piece.
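  // For example, splitting <5 x s16> with NumElts = 2 unmerges to five s16
  // elements, builds two <2 x s16> pieces, and keeps the fifth element as a
  // scalar leftover.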
  SmallVector<Register, 8> Elts;
  extractParts(Reg, EltTy, RegNumElts, Elts);

  unsigned Offset = 0;
  // Requested sub-vectors of NarrowTy.
  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
    VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
  }

  // Leftover element(s).
  if (LeftoverNumElts == 1) {
    VRegs.push_back(Elts[Offset]);
  } else {
    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
    VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
  }
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMerge(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to do
    // anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

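/// Widen \p VRegs, which hold pieces of type \p GCDTy, into \p NarrowTy sized
/// pieces that together cover the least common multiple type of \p DstTy and
/// \p NarrowTy. Missing high pieces are padded with zeroes (G_ZEXT), undef
/// (G_ANYEXT), or copies of the sign bit (G_SEXT), per \p PadStrategy.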
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMerge(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMerge(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

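/// Map a generic opcode and scalar size to the corresponding runtime library
/// call, e.g. G_FSIN with Size == 64 yields RTLIB::SIN_F64.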
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0},
                       {{MI.getOperand(1).getReg(), FromType, 0}});
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
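    // Split the constant into narrow pieces. For example, narrowing an s64
    // G_CONSTANT 0x0123456789ABCDEF to s32 produces the constants 0x89ABCDEF
    // (low part) and 0x01234567 (high part), plus one leftover constant for
    // any remaining odd-sized bits.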
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    // Should widen scalar first
    if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
      return UnableToLegalize;

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
    SmallVector<Register, 8> Parts;
    for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
      Parts.push_back(
          MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

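    // Handle extending loads, where the memory size is smaller than the
    // result: load into a narrow temporary and G_ANYEXT it to the full width.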
    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

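    // TypeIdx == 0: only the result type is narrowed. The bit count is
    // bounded by the source bit width, so it is assumed to fit in NarrowTy,
    // and the narrow result is simply zero-extended back to the original
    // destination type.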
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
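      // Compare the high parts with the original predicate and the low parts
      // with its unsigned equivalent; the low-part comparison decides the
      // result only when the high parts are equal.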
      // TODO: Handle non-power-of-two types.
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
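    //
    // For example, an s64 G_SEXT_INREG of 8 bits narrowed to s32 becomes an
    // s32 G_SEXT_INREG of 8 bits for the low part, and a 31-bit G_ASHR of
    // that part for the high part.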
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
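
    // Swap each narrow part and reverse the order of the parts; together
    // these reverse the bytes (or bits) of the full wide value.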
1377 
1378     for (unsigned i = 0; i < NumParts; ++i) {
1379       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1380                                            {SrcRegs[NumParts - 1 - i]});
1381       DstRegs.push_back(DstPart.getReg(0));
1382     }
1383 
1384     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1385 
1386     Observer.changedInstr(MI);
1387     MI.eraseFromParent();
1388     return Legalized;
1389   }
1390   case TargetOpcode::G_PTR_ADD:
1391   case TargetOpcode::G_PTRMASK: {
1392     if (TypeIdx != 1)
1393       return UnableToLegalize;
1394     Observer.changingInstr(MI);
1395     narrowScalarSrc(MI, NarrowTy, 2);
1396     Observer.changedInstr(MI);
1397     return Legalized;
1398   }
1399   case TargetOpcode::G_FPTOUI:
1400   case TargetOpcode::G_FPTOSI:
1401     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1402   case TargetOpcode::G_FPEXT:
1403     if (TypeIdx != 0)
1404       return UnableToLegalize;
1405     Observer.changingInstr(MI);
1406     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1407     Observer.changedInstr(MI);
1408     return Legalized;
1409   }
1410 }
1411 
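/// Coerce \p Val to an equivalently sized scalar, inserting a G_PTRTOINT or
/// G_BITCAST as needed. Returns an invalid register if \p Val is a pointer in
/// a non-integral address space, which cannot be reinterpreted as an integer.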
1412 Register LegalizerHelper::coerceToScalar(Register Val) {
1413   LLT Ty = MRI.getType(Val);
1414   if (Ty.isScalar())
1415     return Val;
1416 
1417   const DataLayout &DL = MIRBuilder.getDataLayout();
1418   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1419   if (Ty.isPointer()) {
1420     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1421       return Register();
1422     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1423   }
1424 
1425   Register NewVal = Val;
1426 
1427   assert(Ty.isVector());
1428   LLT EltTy = Ty.getElementType();
1429   if (EltTy.isPointer())
1430     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1431   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1432 }
1433 
1434 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1435                                      unsigned OpIdx, unsigned ExtOpcode) {
1436   MachineOperand &MO = MI.getOperand(OpIdx);
1437   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1438   MO.setReg(ExtB.getReg(0));
1439 }
1440 
1441 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1442                                       unsigned OpIdx) {
1443   MachineOperand &MO = MI.getOperand(OpIdx);
1444   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1445   MO.setReg(ExtB.getReg(0));
1446 }
1447 
1448 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1449                                      unsigned OpIdx, unsigned TruncOpcode) {
1450   MachineOperand &MO = MI.getOperand(OpIdx);
1451   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1452   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1453   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1454   MO.setReg(DstExt);
1455 }
1456 
1457 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1458                                       unsigned OpIdx, unsigned ExtOpcode) {
1459   MachineOperand &MO = MI.getOperand(OpIdx);
1460   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1461   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1462   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1463   MO.setReg(DstTrunc);
1464 }
1465 
1466 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1467                                             unsigned OpIdx) {
1468   MachineOperand &MO = MI.getOperand(OpIdx);
1469   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1470   Register Dst = MO.getReg();
1471   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1472   MO.setReg(DstExt);
1473   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1474 }
1475 
void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
}
1482 
1483 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1484   MachineOperand &Op = MI.getOperand(OpIdx);
1485   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1486 }
1487 
1488 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1489   MachineOperand &MO = MI.getOperand(OpIdx);
1490   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1491   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1492   MIRBuilder.buildBitcast(MO, CastDst);
1493   MO.setReg(CastDst);
1494 }
1495 
1496 LegalizerHelper::LegalizeResult
1497 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1498                                         LLT WideTy) {
1499   if (TypeIdx != 1)
1500     return UnableToLegalize;
1501 
1502   Register DstReg = MI.getOperand(0).getReg();
1503   LLT DstTy = MRI.getType(DstReg);
1504   if (DstTy.isVector())
1505     return UnableToLegalize;
1506 
1507   Register Src1 = MI.getOperand(1).getReg();
1508   LLT SrcTy = MRI.getType(Src1);
1509   const int DstSize = DstTy.getSizeInBits();
1510   const int SrcSize = SrcTy.getSizeInBits();
1511   const int WideSize = WideTy.getSizeInBits();
1512   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1513 
1514   unsigned NumOps = MI.getNumOperands();
1515   unsigned NumSrc = MI.getNumOperands() - 1;
1516   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1517 
1518   if (WideSize >= DstSize) {
1519     // Directly pack the bits in the target type.
1520     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1521 
1522     for (unsigned I = 2; I != NumOps; ++I) {
1523       const unsigned Offset = (I - 1) * PartSize;
1524 
1525       Register SrcReg = MI.getOperand(I).getReg();
1526       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1527 
1528       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1529 
1530       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1531         MRI.createGenericVirtualRegister(WideTy);
1532 
1533       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1534       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1535       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1536       ResultReg = NextResult;
1537     }
1538 
1539     if (WideSize > DstSize)
1540       MIRBuilder.buildTrunc(DstReg, ResultReg);
1541     else if (DstTy.isPointer())
1542       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1543 
1544     MI.eraseFromParent();
1545     return Legalized;
1546   }
1547 
1548   // Unmerge the original values to the GCD type, and recombine to the next
1549   // multiple greater than the original type.
1550   //
1551   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1552   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1553   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1554   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1555   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1556   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1557   // %12:_(s12) = G_MERGE_VALUES %10, %11
1558   //
1559   // Padding with undef if necessary:
1560   //
1561   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1562   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1563   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1564   // %7:_(s2) = G_IMPLICIT_DEF
1565   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1566   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1567   // %10:_(s12) = G_MERGE_VALUES %8, %9
1568 
1569   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1570   LLT GCDTy = LLT::scalar(GCD);
1571 
1572   SmallVector<Register, 8> Parts;
1573   SmallVector<Register, 8> NewMergeRegs;
1574   SmallVector<Register, 8> Unmerges;
1575   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1576 
1577   // Decompose the original operands if they don't evenly divide.
1578   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1579     Register SrcReg = MO.getReg();
1580     if (GCD == SrcSize) {
1581       Unmerges.push_back(SrcReg);
1582     } else {
1583       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1584       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1585         Unmerges.push_back(Unmerge.getReg(J));
1586     }
1587   }
1588 
1589   // Pad with undef to the next size that is a multiple of the requested size.
1590   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1591     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1592     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1593       Unmerges.push_back(UndefReg);
1594   }
1595 
1596   const int PartsPerGCD = WideSize / GCD;
1597 
1598   // Build merges of each piece.
1599   ArrayRef<Register> Slicer(Unmerges);
1600   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1601     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1602     NewMergeRegs.push_back(Merge.getReg(0));
1603   }
1604 
1605   // A truncate may be necessary if the requested type doesn't evenly divide the
1606   // original result type.
1607   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1608     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1609   } else {
1610     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1611     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1612   }
1613 
1614   MI.eraseFromParent();
1615   return Legalized;
1616 }
1617 
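/// Return a new \p WideTy virtual register for the caller to define, emitting
/// a merge/unmerge sequence so that \p OrigReg is defined from its low
/// OrigTy-sized bits, with dead defs covering the remainder.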
1618 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1619   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1620   LLT OrigTy = MRI.getType(OrigReg);
1621   LLT LCMTy = getLCMType(WideTy, OrigTy);
1622 
1623   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1624   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1625 
1626   Register UnmergeSrc = WideReg;
1627 
1628   // Create a merge to the LCM type, padding with undef
1629   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1630   // =>
1631   // %1:_(<4 x s32>) = G_FOO
1632   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1633   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1634   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1635   if (NumMergeParts > 1) {
1636     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1637     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1638     MergeParts[0] = WideReg;
1639     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1640   }
1641 
1642   // Unmerge to the original register and pad with dead defs.
1643   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1644   UnmergeResults[0] = OrigReg;
1645   for (int I = 1; I != NumUnmergeParts; ++I)
1646     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1647 
1648   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1649   return WideReg;
1650 }
1651 
1652 LegalizerHelper::LegalizeResult
1653 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1654                                           LLT WideTy) {
1655   if (TypeIdx != 0)
1656     return UnableToLegalize;
1657 
1658   int NumDst = MI.getNumOperands() - 1;
1659   Register SrcReg = MI.getOperand(NumDst).getReg();
1660   LLT SrcTy = MRI.getType(SrcReg);
1661   if (SrcTy.isVector())
1662     return UnableToLegalize;
1663 
1664   Register Dst0Reg = MI.getOperand(0).getReg();
1665   LLT DstTy = MRI.getType(Dst0Reg);
1666   if (!DstTy.isScalar())
1667     return UnableToLegalize;
1668 
1669   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1670     if (SrcTy.isPointer()) {
1671       const DataLayout &DL = MIRBuilder.getDataLayout();
1672       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1673         LLVM_DEBUG(
1674             dbgs() << "Not casting non-integral address space integer\n");
1675         return UnableToLegalize;
1676       }
1677 
1678       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1679       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1680     }
1681 
1682     // Widen SrcTy to WideTy. This does not affect the result, but since the
1683     // user requested this size, it is probably better handled than SrcTy and
1684     // should reduce the total number of legalization artifacts.
1685     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1686       SrcTy = WideTy;
1687       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1688     }
1689 
    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
1692     unsigned DstSize = DstTy.getSizeInBits();
1693 
1694     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1695     for (int I = 1; I != NumDst; ++I) {
1696       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1697       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1698       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1699     }
1700 
1701     MI.eraseFromParent();
1702     return Legalized;
1703   }
1704 
1705   // Extend the source to a wider type.
1706   LLT LCMTy = getLCMType(SrcTy, WideTy);
1707 
1708   Register WideSrc = SrcReg;
1709   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1710     // TODO: If this is an integral address space, cast to integer and anyext.
1711     if (SrcTy.isPointer()) {
1712       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1713       return UnableToLegalize;
1714     }
1715 
1716     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1717   }
1718 
1719   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1720 
1721   // Create a sequence of unmerges and merges to the original results. Since we
1722   // may have widened the source, we will need to pad the results with dead defs
1723   // to cover the source register.
1724   // e.g. widen s48 to s64:
1725   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1726   //
1727   // =>
1728   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1729   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1730   //  ; unpack to GCD type, with extra dead defs
1731   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1732   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
1734   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1735   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1736   const LLT GCDTy = getGCDType(WideTy, DstTy);
1737   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1738   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1739 
1740   // Directly unmerge to the destination without going through a GCD type
1741   // if possible
1742   if (PartsPerRemerge == 1) {
1743     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1744 
1745     for (int I = 0; I != NumUnmerge; ++I) {
1746       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1747 
1748       for (int J = 0; J != PartsPerUnmerge; ++J) {
1749         int Idx = I * PartsPerUnmerge + J;
1750         if (Idx < NumDst)
1751           MIB.addDef(MI.getOperand(Idx).getReg());
1752         else {
1753           // Create dead def for excess components.
1754           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1755         }
1756       }
1757 
1758       MIB.addUse(Unmerge.getReg(I));
1759     }
1760   } else {
1761     SmallVector<Register, 16> Parts;
1762     for (int J = 0; J != NumUnmerge; ++J)
1763       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1764 
1765     SmallVector<Register, 8> RemergeParts;
1766     for (int I = 0; I != NumDst; ++I) {
1767       for (int J = 0; J < PartsPerRemerge; ++J) {
1768         const int Idx = I * PartsPerRemerge + J;
1769         RemergeParts.emplace_back(Parts[Idx]);
1770       }
1771 
1772       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1773       RemergeParts.clear();
1774     }
1775   }
1776 
1777   MI.eraseFromParent();
1778   return Legalized;
1779 }
1780 
1781 LegalizerHelper::LegalizeResult
1782 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1783                                     LLT WideTy) {
1784   Register DstReg = MI.getOperand(0).getReg();
1785   Register SrcReg = MI.getOperand(1).getReg();
1786   LLT SrcTy = MRI.getType(SrcReg);
1787 
1788   LLT DstTy = MRI.getType(DstReg);
1789   unsigned Offset = MI.getOperand(2).getImm();
1790 
1791   if (TypeIdx == 0) {
1792     if (SrcTy.isVector() || DstTy.isVector())
1793       return UnableToLegalize;
1794 
1795     SrcOp Src(SrcReg);
1796     if (SrcTy.isPointer()) {
1797       // Extracts from pointers can be handled only if they are really just
1798       // simple integers.
1799       const DataLayout &DL = MIRBuilder.getDataLayout();
1800       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1801         return UnableToLegalize;
1802 
1803       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1804       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1805       SrcTy = SrcAsIntTy;
1806     }
1807 
1808     if (DstTy.isPointer())
1809       return UnableToLegalize;
1810 
1811     if (Offset == 0) {
1812       // Avoid a shift in the degenerate case.
1813       MIRBuilder.buildTrunc(DstReg,
1814                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1815       MI.eraseFromParent();
1816       return Legalized;
1817     }
1818 
1819     // Do a shift in the source type.
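    // e.g. %x:_(s16) = G_EXTRACT %y:_(s64), 8 becomes (assuming WideTy is no
    // wider than s64, illustrative):
    //   %shr:_(s64) = G_LSHR %y, 8
    //   %x:_(s16) = G_TRUNC %shr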
1820     LLT ShiftTy = SrcTy;
1821     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1822       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1823       ShiftTy = WideTy;
1824     }
1825 
1826     auto LShr = MIRBuilder.buildLShr(
1827       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1828     MIRBuilder.buildTrunc(DstReg, LShr);
1829     MI.eraseFromParent();
1830     return Legalized;
1831   }
1832 
1833   if (SrcTy.isScalar()) {
1834     Observer.changingInstr(MI);
1835     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1836     Observer.changedInstr(MI);
1837     return Legalized;
1838   }
1839 
1840   if (!SrcTy.isVector())
1841     return UnableToLegalize;
1842 
1843   if (DstTy != SrcTy.getElementType())
1844     return UnableToLegalize;
1845 
1846   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1847     return UnableToLegalize;
1848 
1849   Observer.changingInstr(MI);
1850   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1851 
1852   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1853                           Offset);
1854   widenScalarDst(MI, WideTy.getScalarType(), 0);
1855   Observer.changedInstr(MI);
1856   return Legalized;
1857 }
1858 
1859 LegalizerHelper::LegalizeResult
1860 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1861                                    LLT WideTy) {
1862   if (TypeIdx != 0 || WideTy.isVector())
1863     return UnableToLegalize;
1864   Observer.changingInstr(MI);
1865   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1866   widenScalarDst(MI, WideTy);
1867   Observer.changedInstr(MI);
1868   return Legalized;
1869 }
1870 
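/// Widen the scalar operands of an overflowing add/sub, performing the
/// arithmetic in \p WideTy and re-deriving the overflow bit: the wide result
/// overflowed the narrow type iff extending its truncation does not give the
/// wide result back. E.g. for G_UADDO on s8 widened to s32 (illustrative):
///   %wide:_(s32) = G_ADD (G_ZEXT %lhs), (G_ZEXT %rhs)
///   %ov:_(s1) = G_ICMP ne, %wide, (G_ZEXT (G_TRUNC %wide))
///   %res:_(s8) = G_TRUNC %wide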
1871 LegalizerHelper::LegalizeResult
1872 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1873                                            LLT WideTy) {
1874   if (TypeIdx == 1)
1875     return UnableToLegalize; // TODO
1876 
1877   unsigned Opcode;
1878   unsigned ExtOpcode;
1879   Optional<Register> CarryIn = None;
1880   switch (MI.getOpcode()) {
1881   default:
1882     llvm_unreachable("Unexpected opcode!");
1883   case TargetOpcode::G_SADDO:
1884     Opcode = TargetOpcode::G_ADD;
1885     ExtOpcode = TargetOpcode::G_SEXT;
1886     break;
1887   case TargetOpcode::G_SSUBO:
1888     Opcode = TargetOpcode::G_SUB;
1889     ExtOpcode = TargetOpcode::G_SEXT;
1890     break;
1891   case TargetOpcode::G_UADDO:
1892     Opcode = TargetOpcode::G_ADD;
1893     ExtOpcode = TargetOpcode::G_ZEXT;
1894     break;
1895   case TargetOpcode::G_USUBO:
1896     Opcode = TargetOpcode::G_SUB;
1897     ExtOpcode = TargetOpcode::G_ZEXT;
1898     break;
1899   case TargetOpcode::G_SADDE:
1900     Opcode = TargetOpcode::G_UADDE;
1901     ExtOpcode = TargetOpcode::G_SEXT;
1902     CarryIn = MI.getOperand(4).getReg();
1903     break;
1904   case TargetOpcode::G_SSUBE:
1905     Opcode = TargetOpcode::G_USUBE;
1906     ExtOpcode = TargetOpcode::G_SEXT;
1907     CarryIn = MI.getOperand(4).getReg();
1908     break;
1909   case TargetOpcode::G_UADDE:
1910     Opcode = TargetOpcode::G_UADDE;
1911     ExtOpcode = TargetOpcode::G_ZEXT;
1912     CarryIn = MI.getOperand(4).getReg();
1913     break;
1914   case TargetOpcode::G_USUBE:
1915     Opcode = TargetOpcode::G_USUBE;
1916     ExtOpcode = TargetOpcode::G_ZEXT;
1917     CarryIn = MI.getOperand(4).getReg();
1918     break;
1919   }
1920 
1921   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1922   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1923   // Do the arithmetic in the larger type.
1924   Register NewOp;
1925   if (CarryIn) {
1926     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1927     NewOp = MIRBuilder
1928                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1929                             {LHSExt, RHSExt, *CarryIn})
1930                 .getReg(0);
1931   } else {
1932     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1933   }
1934   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1935   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1936   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1937   // There is no overflow if the ExtOp is the same as NewOp.
1938   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1939   // Now trunc the NewOp to the original result.
1940   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1941   MI.eraseFromParent();
1942   return Legalized;
1943 }
1944 
1945 LegalizerHelper::LegalizeResult
1946 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1947                                          LLT WideTy) {
1948   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1949                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1950                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1951   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1952                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1953   // We can convert this to:
1954   //   1. Any extend iN to iM
1955   //   2. SHL by M-N
1956   //   3. [US][ADD|SUB|SHL]SAT
1957   //   4. L/ASHR by M-N
1958   //
1959   // It may be more efficient to lower this to a min and a max operation in
1960   // the higher precision arithmetic if the promoted operation isn't legal,
1961   // but this decision is up to the target's lowering request.
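  //
  // e.g. G_SADDSAT on s8 widened to s32 (illustrative):
  //   %a:_(s32) = G_SHL (G_ANYEXT %lhs), 24
  //   %b:_(s32) = G_SHL (G_ANYEXT %rhs), 24
  //   %sat:_(s32) = G_SADDSAT %a, %b        ; saturates at the s32 boundary
  //   %res:_(s8) = G_TRUNC (G_ASHR %sat, 24)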
1962   Register DstReg = MI.getOperand(0).getReg();
1963 
1964   unsigned NewBits = WideTy.getScalarSizeInBits();
1965   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1966 
1967   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1968   // must not left shift the RHS to preserve the shift amount.
1969   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1970   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1971                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1972   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1973   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1974   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1975 
1976   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1977                                         {ShiftL, ShiftR}, MI.getFlags());
1978 
1979   // Use a shift that will preserve the number of sign bits when the trunc is
1980   // folded away.
1981   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1982                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1983 
1984   MIRBuilder.buildTrunc(DstReg, Result);
1985   MI.eraseFromParent();
1986   return Legalized;
1987 }
1988 
1989 LegalizerHelper::LegalizeResult
1990 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1991                                  LLT WideTy) {
1992   if (TypeIdx == 1)
1993     return UnableToLegalize;
1994 
1995   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1996   Register Result = MI.getOperand(0).getReg();
1997   Register OriginalOverflow = MI.getOperand(1).getReg();
1998   Register LHS = MI.getOperand(2).getReg();
1999   Register RHS = MI.getOperand(3).getReg();
2000   LLT SrcTy = MRI.getType(LHS);
2001   LLT OverflowTy = MRI.getType(OriginalOverflow);
2002   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2003 
2004   // To determine if the result overflowed in the larger type, we extend the
2005   // input to the larger type, do the multiply (checking if it overflows),
2006   // then also check the high bits of the result to see if overflow happened
2007   // there.
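  //
  // e.g. for G_UMULO on s8 widened to s32, the product of two zero-extended
  // s8 values fits in 16 bits, so the wide multiply can never overflow;
  // overflow of the original operation is simply whether any bit above bit 7
  // is set in the wide product.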
2008   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2009   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2010   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2011 
2012   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
2013                                     {LeftOperand, RightOperand});
2014   auto Mul = Mulo->getOperand(0);
2015   MIRBuilder.buildTrunc(Result, Mul);
2016 
2017   MachineInstrBuilder ExtResult;
2018   // Overflow occurred if it occurred in the larger type, or if the high part
2019   // of the result does not zero/sign-extend the low part.  Check this second
2020   // possibility first.
2021   if (IsSigned) {
2022     // For signed, overflow occurred when the high part does not sign-extend
2023     // the low part.
2024     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2025   } else {
2026     // Unsigned overflow occurred when the high part does not zero-extend the
2027     // low part.
2028     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2029   }
2030 
2031   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2032   // so we don't need to check the overflow result of larger type Mulo.
2033   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
2034     auto Overflow =
2035         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2036     // Finally check if the multiplication in the larger type itself overflowed.
2037     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2038   } else {
2039     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2040   }
2041   MI.eraseFromParent();
2042   return Legalized;
2043 }
2044 
2045 LegalizerHelper::LegalizeResult
2046 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2047   switch (MI.getOpcode()) {
2048   default:
2049     return UnableToLegalize;
2050   case TargetOpcode::G_ATOMICRMW_XCHG:
2051   case TargetOpcode::G_ATOMICRMW_ADD:
2052   case TargetOpcode::G_ATOMICRMW_SUB:
2053   case TargetOpcode::G_ATOMICRMW_AND:
2054   case TargetOpcode::G_ATOMICRMW_OR:
2055   case TargetOpcode::G_ATOMICRMW_XOR:
2056   case TargetOpcode::G_ATOMICRMW_MIN:
2057   case TargetOpcode::G_ATOMICRMW_MAX:
2058   case TargetOpcode::G_ATOMICRMW_UMIN:
2059   case TargetOpcode::G_ATOMICRMW_UMAX:
2060     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2061     Observer.changingInstr(MI);
2062     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2063     widenScalarDst(MI, WideTy, 0);
2064     Observer.changedInstr(MI);
2065     return Legalized;
2066   case TargetOpcode::G_ATOMIC_CMPXCHG:
2067     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2068     Observer.changingInstr(MI);
2069     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2070     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2071     widenScalarDst(MI, WideTy, 0);
2072     Observer.changedInstr(MI);
2073     return Legalized;
2074   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2075     if (TypeIdx == 0) {
2076       Observer.changingInstr(MI);
2077       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2078       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2079       widenScalarDst(MI, WideTy, 0);
2080       Observer.changedInstr(MI);
2081       return Legalized;
2082     }
2083     assert(TypeIdx == 1 &&
2084            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2085     Observer.changingInstr(MI);
2086     widenScalarDst(MI, WideTy, 1);
2087     Observer.changedInstr(MI);
2088     return Legalized;
2089   case TargetOpcode::G_EXTRACT:
2090     return widenScalarExtract(MI, TypeIdx, WideTy);
2091   case TargetOpcode::G_INSERT:
2092     return widenScalarInsert(MI, TypeIdx, WideTy);
2093   case TargetOpcode::G_MERGE_VALUES:
2094     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2095   case TargetOpcode::G_UNMERGE_VALUES:
2096     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2097   case TargetOpcode::G_SADDO:
2098   case TargetOpcode::G_SSUBO:
2099   case TargetOpcode::G_UADDO:
2100   case TargetOpcode::G_USUBO:
2101   case TargetOpcode::G_SADDE:
2102   case TargetOpcode::G_SSUBE:
2103   case TargetOpcode::G_UADDE:
2104   case TargetOpcode::G_USUBE:
2105     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2106   case TargetOpcode::G_UMULO:
2107   case TargetOpcode::G_SMULO:
2108     return widenScalarMulo(MI, TypeIdx, WideTy);
2109   case TargetOpcode::G_SADDSAT:
2110   case TargetOpcode::G_SSUBSAT:
2111   case TargetOpcode::G_SSHLSAT:
2112   case TargetOpcode::G_UADDSAT:
2113   case TargetOpcode::G_USUBSAT:
2114   case TargetOpcode::G_USHLSAT:
2115     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2116   case TargetOpcode::G_CTTZ:
2117   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2118   case TargetOpcode::G_CTLZ:
2119   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2120   case TargetOpcode::G_CTPOP: {
2121     if (TypeIdx == 0) {
2122       Observer.changingInstr(MI);
2123       widenScalarDst(MI, WideTy, 0);
2124       Observer.changedInstr(MI);
2125       return Legalized;
2126     }
2127 
2128     Register SrcReg = MI.getOperand(1).getReg();
2129 
2130     // First extend the input.
2131     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2132                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2133                           ? TargetOpcode::G_ANYEXT
2134                           : TargetOpcode::G_ZEXT;
2135     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2136     LLT CurTy = MRI.getType(SrcReg);
2137     unsigned NewOpc = MI.getOpcode();
2138     if (NewOpc == TargetOpcode::G_CTTZ) {
2139       // The count is the same in the larger type except if the original
2140       // value was zero.  This can be handled by setting the bit just off
2141       // the top of the original type.
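      // e.g. for G_CTTZ on s8 widened to s32, OR in bit 8 (0x100) so that a
      // zero input produces a count of 8, the correct result at the original
      // width.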
2142       auto TopBit =
2143           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2144       MIBSrc = MIRBuilder.buildOr(
2145         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2146       // Now we know the operand is non-zero, use the more relaxed opcode.
2147       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2148     }
2149 
2150     // Perform the operation at the larger size.
2151     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2152     // This is already the correct result for CTPOP and CTTZs
2153     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2154         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
      // The correct result is NewOp - (difference in size between WideTy and
      // CurTy).
2156       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2157       MIBNewOp = MIRBuilder.buildSub(
2158           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2159     }
2160 
2161     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2162     MI.eraseFromParent();
2163     return Legalized;
2164   }
2165   case TargetOpcode::G_BSWAP: {
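    // Swap at the wide width, then shift the interesting bytes back down,
    // e.g. for s16 in s32 (illustrative):
    //   bswap16(x) == trunc(bswap32(anyext(x)) >> 16)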
2166     Observer.changingInstr(MI);
2167     Register DstReg = MI.getOperand(0).getReg();
2168 
2169     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2170     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2171     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2172     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2173 
2174     MI.getOperand(0).setReg(DstExt);
2175 
2176     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2177 
2178     LLT Ty = MRI.getType(DstReg);
2179     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2180     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2181     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2182 
2183     MIRBuilder.buildTrunc(DstReg, ShrReg);
2184     Observer.changedInstr(MI);
2185     return Legalized;
2186   }
2187   case TargetOpcode::G_BITREVERSE: {
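    // As for G_BSWAP: reverse at the wide width, then shift the result back
    // down, e.g. bitreverse16(x) == trunc(bitreverse32(anyext(x)) >> 16).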
2188     Observer.changingInstr(MI);
2189 
2190     Register DstReg = MI.getOperand(0).getReg();
2191     LLT Ty = MRI.getType(DstReg);
2192     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2193 
2194     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2195     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2196     MI.getOperand(0).setReg(DstExt);
2197     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2198 
2199     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2200     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2201     MIRBuilder.buildTrunc(DstReg, Shift);
2202     Observer.changedInstr(MI);
2203     return Legalized;
2204   }
2205   case TargetOpcode::G_FREEZE:
2206     Observer.changingInstr(MI);
2207     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2208     widenScalarDst(MI, WideTy);
2209     Observer.changedInstr(MI);
2210     return Legalized;
2211 
2212   case TargetOpcode::G_ABS:
2213     Observer.changingInstr(MI);
2214     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2215     widenScalarDst(MI, WideTy);
2216     Observer.changedInstr(MI);
2217     return Legalized;
2218 
2219   case TargetOpcode::G_ADD:
2220   case TargetOpcode::G_AND:
2221   case TargetOpcode::G_MUL:
2222   case TargetOpcode::G_OR:
2223   case TargetOpcode::G_XOR:
2224   case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high bits
    // don't affect the result) and then truncate the result back to the
    // original type.
2228     Observer.changingInstr(MI);
2229     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2230     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2231     widenScalarDst(MI, WideTy);
2232     Observer.changedInstr(MI);
2233     return Legalized;
2234 
2235   case TargetOpcode::G_SBFX:
2236   case TargetOpcode::G_UBFX:
2237     Observer.changingInstr(MI);
2238 
2239     if (TypeIdx == 0) {
2240       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2241       widenScalarDst(MI, WideTy);
2242     } else {
2243       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     }
2246 
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SHL:
2251     Observer.changingInstr(MI);
2252 
2253     if (TypeIdx == 0) {
2254       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2255       widenScalarDst(MI, WideTy);
2256     } else {
2257       assert(TypeIdx == 1);
2258       // The "number of bits to shift" operand must preserve its value as an
2259       // unsigned integer:
2260       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2261     }
2262 
2263     Observer.changedInstr(MI);
2264     return Legalized;
2265 
2266   case TargetOpcode::G_SDIV:
2267   case TargetOpcode::G_SREM:
2268   case TargetOpcode::G_SMIN:
2269   case TargetOpcode::G_SMAX:
2270     Observer.changingInstr(MI);
2271     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2272     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2273     widenScalarDst(MI, WideTy);
2274     Observer.changedInstr(MI);
2275     return Legalized;
2276 
2277   case TargetOpcode::G_SDIVREM:
2278     Observer.changingInstr(MI);
2279     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2280     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2281     widenScalarDst(MI, WideTy);
2282     widenScalarDst(MI, WideTy, 1);
2283     Observer.changedInstr(MI);
2284     return Legalized;
2285 
2286   case TargetOpcode::G_ASHR:
2287   case TargetOpcode::G_LSHR:
2288     Observer.changingInstr(MI);
2289 
2290     if (TypeIdx == 0) {
2291       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2292         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2293 
2294       widenScalarSrc(MI, WideTy, 1, CvtOp);
2295       widenScalarDst(MI, WideTy);
2296     } else {
2297       assert(TypeIdx == 1);
2298       // The "number of bits to shift" operand must preserve its value as an
2299       // unsigned integer:
2300       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2301     }
2302 
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305   case TargetOpcode::G_UDIV:
2306   case TargetOpcode::G_UREM:
2307   case TargetOpcode::G_UMIN:
2308   case TargetOpcode::G_UMAX:
2309     Observer.changingInstr(MI);
2310     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2311     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2312     widenScalarDst(MI, WideTy);
2313     Observer.changedInstr(MI);
2314     return Legalized;
2315 
2316   case TargetOpcode::G_UDIVREM:
2317     Observer.changingInstr(MI);
2318     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2319     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2320     widenScalarDst(MI, WideTy);
2321     widenScalarDst(MI, WideTy, 1);
2322     Observer.changedInstr(MI);
2323     return Legalized;
2324 
2325   case TargetOpcode::G_SELECT:
2326     Observer.changingInstr(MI);
2327     if (TypeIdx == 0) {
2328       // Perform operation at larger width (any extension is fine here, high
2329       // bits don't affect the result) and then truncate the result back to the
2330       // original type.
2331       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2332       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2333       widenScalarDst(MI, WideTy);
2334     } else {
2335       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2336       // Explicit extension is required here since high bits affect the result.
2337       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2338     }
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341 
2342   case TargetOpcode::G_FPTOSI:
2343   case TargetOpcode::G_FPTOUI:
2344     Observer.changingInstr(MI);
2345 
2346     if (TypeIdx == 0)
2347       widenScalarDst(MI, WideTy);
2348     else
2349       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2350 
2351     Observer.changedInstr(MI);
2352     return Legalized;
2353   case TargetOpcode::G_SITOFP:
2354     Observer.changingInstr(MI);
2355 
2356     if (TypeIdx == 0)
2357       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2358     else
2359       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2360 
2361     Observer.changedInstr(MI);
2362     return Legalized;
2363   case TargetOpcode::G_UITOFP:
2364     Observer.changingInstr(MI);
2365 
2366     if (TypeIdx == 0)
2367       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2368     else
2369       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2370 
2371     Observer.changedInstr(MI);
2372     return Legalized;
2373   case TargetOpcode::G_LOAD:
2374   case TargetOpcode::G_SEXTLOAD:
2375   case TargetOpcode::G_ZEXTLOAD:
2376     Observer.changingInstr(MI);
2377     widenScalarDst(MI, WideTy);
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_STORE: {
2382     if (TypeIdx != 0)
2383       return UnableToLegalize;
2384 
2385     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2386     if (!Ty.isScalar())
2387       return UnableToLegalize;
2388 
2389     Observer.changingInstr(MI);
2390 
2391     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2392       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2393     widenScalarSrc(MI, WideTy, 0, ExtType);
2394 
2395     Observer.changedInstr(MI);
2396     return Legalized;
2397   }
2398   case TargetOpcode::G_CONSTANT: {
2399     MachineOperand &SrcMO = MI.getOperand(1);
2400     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2401     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2402         MRI.getType(MI.getOperand(0).getReg()));
2403     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2404             ExtOpc == TargetOpcode::G_ANYEXT) &&
2405            "Illegal Extend");
2406     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2407     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2408                            ? SrcVal.sext(WideTy.getSizeInBits())
2409                            : SrcVal.zext(WideTy.getSizeInBits());
2410     Observer.changingInstr(MI);
2411     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2412 
2413     widenScalarDst(MI, WideTy);
2414     Observer.changedInstr(MI);
2415     return Legalized;
2416   }
2417   case TargetOpcode::G_FCONSTANT: {
2418     MachineOperand &SrcMO = MI.getOperand(1);
2419     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2420     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2421     bool LosesInfo;
2422     switch (WideTy.getSizeInBits()) {
2423     case 32:
2424       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2425                   &LosesInfo);
2426       break;
2427     case 64:
2428       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2429                   &LosesInfo);
2430       break;
2431     default:
2432       return UnableToLegalize;
2433     }
2434 
2435     assert(!LosesInfo && "extend should always be lossless");
2436 
2437     Observer.changingInstr(MI);
2438     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2439 
2440     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2441     Observer.changedInstr(MI);
2442     return Legalized;
2443   }
2444   case TargetOpcode::G_IMPLICIT_DEF: {
2445     Observer.changingInstr(MI);
2446     widenScalarDst(MI, WideTy);
2447     Observer.changedInstr(MI);
2448     return Legalized;
2449   }
2450   case TargetOpcode::G_BRCOND:
2451     Observer.changingInstr(MI);
2452     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2453     Observer.changedInstr(MI);
2454     return Legalized;
2455 
2456   case TargetOpcode::G_FCMP:
2457     Observer.changingInstr(MI);
2458     if (TypeIdx == 0)
2459       widenScalarDst(MI, WideTy);
2460     else {
2461       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2462       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2463     }
2464     Observer.changedInstr(MI);
2465     return Legalized;
2466 
2467   case TargetOpcode::G_ICMP:
2468     Observer.changingInstr(MI);
2469     if (TypeIdx == 0)
2470       widenScalarDst(MI, WideTy);
2471     else {
2472       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2473                                MI.getOperand(1).getPredicate()))
2474                                ? TargetOpcode::G_SEXT
2475                                : TargetOpcode::G_ZEXT;
2476       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2477       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2478     }
2479     Observer.changedInstr(MI);
2480     return Legalized;
2481 
2482   case TargetOpcode::G_PTR_ADD:
2483     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2484     Observer.changingInstr(MI);
2485     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2486     Observer.changedInstr(MI);
2487     return Legalized;
2488 
2489   case TargetOpcode::G_PHI: {
2490     assert(TypeIdx == 0 && "Expecting only Idx 0");
2491 
2492     Observer.changingInstr(MI);
2493     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2494       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2495       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2496       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2497     }
2498 
2499     MachineBasicBlock &MBB = *MI.getParent();
2500     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2501     widenScalarDst(MI, WideTy);
2502     Observer.changedInstr(MI);
2503     return Legalized;
2504   }
2505   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2506     if (TypeIdx == 0) {
2507       Register VecReg = MI.getOperand(1).getReg();
2508       LLT VecTy = MRI.getType(VecReg);
2509       Observer.changingInstr(MI);
2510 
2511       widenScalarSrc(
2512           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2513           TargetOpcode::G_ANYEXT);
2514 
2515       widenScalarDst(MI, WideTy, 0);
2516       Observer.changedInstr(MI);
2517       return Legalized;
2518     }
2519 
2520     if (TypeIdx != 2)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     // TODO: Probably should be zext
2524     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2529     if (TypeIdx == 1) {
2530       Observer.changingInstr(MI);
2531 
2532       Register VecReg = MI.getOperand(1).getReg();
2533       LLT VecTy = MRI.getType(VecReg);
2534       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2535 
2536       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2537       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2538       widenScalarDst(MI, WideVecTy, 0);
2539       Observer.changedInstr(MI);
2540       return Legalized;
2541     }
2542 
2543     if (TypeIdx == 2) {
2544       Observer.changingInstr(MI);
2545       // TODO: Probably should be zext
2546       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2547       Observer.changedInstr(MI);
2548       return Legalized;
2549     }
2550 
2551     return UnableToLegalize;
2552   }
2553   case TargetOpcode::G_FADD:
2554   case TargetOpcode::G_FMUL:
2555   case TargetOpcode::G_FSUB:
2556   case TargetOpcode::G_FMA:
2557   case TargetOpcode::G_FMAD:
2558   case TargetOpcode::G_FNEG:
2559   case TargetOpcode::G_FABS:
2560   case TargetOpcode::G_FCANONICALIZE:
2561   case TargetOpcode::G_FMINNUM:
2562   case TargetOpcode::G_FMAXNUM:
2563   case TargetOpcode::G_FMINNUM_IEEE:
2564   case TargetOpcode::G_FMAXNUM_IEEE:
2565   case TargetOpcode::G_FMINIMUM:
2566   case TargetOpcode::G_FMAXIMUM:
2567   case TargetOpcode::G_FDIV:
2568   case TargetOpcode::G_FREM:
2569   case TargetOpcode::G_FCEIL:
2570   case TargetOpcode::G_FFLOOR:
2571   case TargetOpcode::G_FCOS:
2572   case TargetOpcode::G_FSIN:
2573   case TargetOpcode::G_FLOG10:
2574   case TargetOpcode::G_FLOG:
2575   case TargetOpcode::G_FLOG2:
2576   case TargetOpcode::G_FRINT:
2577   case TargetOpcode::G_FNEARBYINT:
2578   case TargetOpcode::G_FSQRT:
2579   case TargetOpcode::G_FEXP:
2580   case TargetOpcode::G_FEXP2:
2581   case TargetOpcode::G_FPOW:
2582   case TargetOpcode::G_INTRINSIC_TRUNC:
2583   case TargetOpcode::G_INTRINSIC_ROUND:
2584   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2585     assert(TypeIdx == 0);
2586     Observer.changingInstr(MI);
2587 
2588     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2589       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2590 
2591     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2592     Observer.changedInstr(MI);
2593     return Legalized;
2594   case TargetOpcode::G_FPOWI: {
2595     if (TypeIdx != 0)
2596       return UnableToLegalize;
2597     Observer.changingInstr(MI);
2598     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2599     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2600     Observer.changedInstr(MI);
2601     return Legalized;
2602   }
2603   case TargetOpcode::G_INTTOPTR:
2604     if (TypeIdx != 1)
2605       return UnableToLegalize;
2606 
2607     Observer.changingInstr(MI);
2608     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2609     Observer.changedInstr(MI);
2610     return Legalized;
2611   case TargetOpcode::G_PTRTOINT:
2612     if (TypeIdx != 0)
2613       return UnableToLegalize;
2614 
2615     Observer.changingInstr(MI);
2616     widenScalarDst(MI, WideTy, 0);
2617     Observer.changedInstr(MI);
2618     return Legalized;
2619   case TargetOpcode::G_BUILD_VECTOR: {
2620     Observer.changingInstr(MI);
2621 
2622     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2623     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2624       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2625 
2626     // Avoid changing the result vector type if the source element type was
2627     // requested.
2628     if (TypeIdx == 1) {
2629       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2630     } else {
2631       widenScalarDst(MI, WideTy, 0);
2632     }
2633 
2634     Observer.changedInstr(MI);
2635     return Legalized;
2636   }
2637   case TargetOpcode::G_SEXT_INREG:
2638     if (TypeIdx != 0)
2639       return UnableToLegalize;
2640 
2641     Observer.changingInstr(MI);
2642     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2643     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2644     Observer.changedInstr(MI);
2645     return Legalized;
2646   case TargetOpcode::G_PTRMASK: {
2647     if (TypeIdx != 1)
2648       return UnableToLegalize;
2649     Observer.changingInstr(MI);
2650     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2651     Observer.changedInstr(MI);
2652     return Legalized;
2653   }
2654   }
2655 }
2656 
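/// Unmerge \p Src into \p Ty-sized pieces, appending them to \p Pieces.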
2657 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2658                              MachineIRBuilder &B, Register Src, LLT Ty) {
2659   auto Unmerge = B.buildUnmerge(Ty, Src);
2660   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2661     Pieces.push_back(Unmerge.getReg(I));
2662 }
2663 
2664 LegalizerHelper::LegalizeResult
2665 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2666   Register Dst = MI.getOperand(0).getReg();
2667   Register Src = MI.getOperand(1).getReg();
2668   LLT DstTy = MRI.getType(Dst);
2669   LLT SrcTy = MRI.getType(Src);
2670 
2671   if (SrcTy.isVector()) {
2672     LLT SrcEltTy = SrcTy.getElementType();
2673     SmallVector<Register, 8> SrcRegs;
2674 
2675     if (DstTy.isVector()) {
2676       int NumDstElt = DstTy.getNumElements();
2677       int NumSrcElt = SrcTy.getNumElements();
2678 
2679       LLT DstEltTy = DstTy.getElementType();
2680       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2681       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2682 
2683       // If there's an element size mismatch, insert intermediate casts to match
2684       // the result element type.
2685       if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
2694         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2695         SrcPartTy = SrcEltTy;
2696       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2697         //
2698         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2699         //
2700         // =>
2701         //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
2706         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2707         DstCastTy = DstEltTy;
2708       }
2709 
2710       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2711       for (Register &SrcReg : SrcRegs)
2712         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2713     } else
2714       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2715 
2716     MIRBuilder.buildMerge(Dst, SrcRegs);
2717     MI.eraseFromParent();
2718     return Legalized;
2719   }
2720 
2721   if (DstTy.isVector()) {
2722     SmallVector<Register, 8> SrcRegs;
2723     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2724     MIRBuilder.buildMerge(Dst, SrcRegs);
2725     MI.eraseFromParent();
2726     return Legalized;
2727   }
2728 
2729   return UnableToLegalize;
2730 }
2731 
2732 /// Figure out the bit offset into a register when coercing a vector index for
2733 /// the wide element type. This is only for the case when promoting vector to
2734 /// one with larger elements.
2735 //
2736 ///
2737 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2738 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
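///
/// e.g. when viewing <8 x s8> as <2 x s32> (NewEltSize 32, OldEltSize 8):
///   %offset_idx = G_AND %idx, 3          ; sub-index within the s32 element
///   %offset_bits = G_SHL %offset_idx, 3  ; bit offset = sub-index * 8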
2739 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2740                                                    Register Idx,
2741                                                    unsigned NewEltSize,
2742                                                    unsigned OldEltSize) {
2743   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2744   LLT IdxTy = B.getMRI()->getType(Idx);
2745 
2746   // Now figure out the amount we need to shift to get the target bits.
2747   auto OffsetMask = B.buildConstant(
2748       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2749   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2750   return B.buildShl(IdxTy, OffsetIdx,
2751                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2752 }
2753 
2754 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2755 /// is casting to a vector with a smaller element size, perform multiple element
2756 /// extracts and merge the results. If this is coercing to a vector with larger
2757 /// elements, index the bitcasted vector and extract the target element with bit
2758 /// operations. This is intended to force the indexing in the native register
2759 /// size for architectures that can dynamically index the register file.
2760 LegalizerHelper::LegalizeResult
2761 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2762                                          LLT CastTy) {
2763   if (TypeIdx != 1)
2764     return UnableToLegalize;
2765 
2766   Register Dst = MI.getOperand(0).getReg();
2767   Register SrcVec = MI.getOperand(1).getReg();
2768   Register Idx = MI.getOperand(2).getReg();
2769   LLT SrcVecTy = MRI.getType(SrcVec);
2770   LLT IdxTy = MRI.getType(Idx);
2771 
2772   LLT SrcEltTy = SrcVecTy.getElementType();
2773   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2774   unsigned OldNumElts = SrcVecTy.getNumElements();
2775 
2776   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2777   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2778 
2779   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2780   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2781   if (NewNumElts > OldNumElts) {
2782     // Decreasing the vector element size
2783     //
2784     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2785     //  =>
2786     //  v4i32:castx = bitcast x:v2i64
2787     //
2788     // i64 = bitcast
2789     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2790     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2791     //
2792     if (NewNumElts % OldNumElts != 0)
2793       return UnableToLegalize;
2794 
2795     // Type of the intermediate result vector.
2796     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2797     LLT MidTy =
2798         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2799 
2800     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2801 
2802     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2803     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2804 
2805     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2806       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2807       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2808       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2809       NewOps[I] = Elt.getReg(0);
2810     }
2811 
2812     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2813     MIRBuilder.buildBitcast(Dst, NewVec);
2814     MI.eraseFromParent();
2815     return Legalized;
2816   }
2817 
2818   if (NewNumElts < OldNumElts) {
2819     if (NewEltSize % OldEltSize != 0)
2820       return UnableToLegalize;
2821 
2822     // This only depends on powers of 2 because we use bit tricks to figure out
2823     // the bit offset we need to shift to get the target element. A general
2824     // expansion could emit division/multiply.
2825     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2826       return UnableToLegalize;
2827 
2828     // Increasing the vector element size.
2829     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2830     //
2831     //   =>
2832     //
2833     // %cast = G_BITCAST %vec
2834     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2835     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2836     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2837     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2838     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2839     // %elt = G_TRUNC %elt_bits
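    //
    // For example, extracting an s8 element from a v16s8 coerced to v4s32:
    // %scaled_idx = %idx >> 2 selects the s32 element, and
    // %offset_bits = (%idx & 3) << 3 selects the byte within it.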
2840 
2841     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2842     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2843 
2844     // Divide to get the index in the wider element type.
2845     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2846 
2847     Register WideElt = CastVec;
2848     if (CastTy.isVector()) {
2849       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2850                                                      ScaledIdx).getReg(0);
2851     }
2852 
2853     // Compute the bit offset into the register of the target element.
2854     Register OffsetBits = getBitcastWiderVectorElementOffset(
2855       MIRBuilder, Idx, NewEltSize, OldEltSize);
2856 
2857     // Shift the wide element to get the target element.
2858     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2859     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2860     MI.eraseFromParent();
2861     return Legalized;
2862   }
2863 
2864   return UnableToLegalize;
2865 }
2866 
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits, while
/// preserving the other bits in \p TargetReg.
///
/// (InsertReg << Offset) | (TargetReg & ~(~(-1 << InsertReg.size()) << Offset))
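///
/// For example, with an s32 TargetReg, an s8 InsertReg, and an offset of 16,
/// the mask is 0xFF << 16 = 0x00FF0000, giving
/// (TargetReg & 0xFF00FFFF) | (zext(InsertReg) << 16).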
2871 static Register buildBitFieldInsert(MachineIRBuilder &B,
2872                                     Register TargetReg, Register InsertReg,
2873                                     Register OffsetBits) {
2874   LLT TargetTy = B.getMRI()->getType(TargetReg);
2875   LLT InsertTy = B.getMRI()->getType(InsertReg);
2876   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2877   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2878 
2879   // Produce a bitmask of the value to insert
2880   auto EltMask = B.buildConstant(
2881     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2882                                    InsertTy.getSizeInBits()));
2883   // Shift it into position
2884   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2885   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2886 
2887   // Clear out the bits in the wide element
2888   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2889 
  // The zero-extended insert value already has zeros in its high bits, so OR
  // it into the masked wide element.
2892   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2893 }
2894 
2895 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2896 /// is increasing the element size, perform the indexing in the target element
2897 /// type, and use bit operations to insert at the element position. This is
2898 /// intended for architectures that can dynamically index the register file and
2899 /// want to force indexing in the native register size.
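///
/// In outline, for the larger element case:
///
///   %cast = G_BITCAST %vec
///   %scaled_idx = G_LSHR %idx, Log2(NewEltSize / OldEltSize)
///   %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
///   %merged = <bitfield insert of %val into %wide_elt at the bit offset>
///   %new_vec = G_INSERT_VECTOR_ELT %cast, %merged, %scaled_idx
///   %dst = G_BITCAST %new_vec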
2900 LegalizerHelper::LegalizeResult
2901 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2902                                         LLT CastTy) {
2903   if (TypeIdx != 0)
2904     return UnableToLegalize;
2905 
2906   Register Dst = MI.getOperand(0).getReg();
2907   Register SrcVec = MI.getOperand(1).getReg();
2908   Register Val = MI.getOperand(2).getReg();
2909   Register Idx = MI.getOperand(3).getReg();
2910 
2911   LLT VecTy = MRI.getType(Dst);
2912   LLT IdxTy = MRI.getType(Idx);
2913 
2914   LLT VecEltTy = VecTy.getElementType();
2915   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2916   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2917   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2918 
2919   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2920   unsigned OldNumElts = VecTy.getNumElements();
2921 
2922   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2923   if (NewNumElts < OldNumElts) {
2924     if (NewEltSize % OldEltSize != 0)
2925       return UnableToLegalize;
2926 
2927     // This only depends on powers of 2 because we use bit tricks to figure out
2928     // the bit offset we need to shift to get the target element. A general
2929     // expansion could emit division/multiply.
2930     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2931       return UnableToLegalize;
2932 
2933     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2934     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2935 
2936     // Divide to get the index in the wider element type.
2937     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2938 
2939     Register ExtractedElt = CastVec;
2940     if (CastTy.isVector()) {
2941       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2942                                                           ScaledIdx).getReg(0);
2943     }
2944 
2945     // Compute the bit offset into the register of the target element.
2946     Register OffsetBits = getBitcastWiderVectorElementOffset(
2947       MIRBuilder, Idx, NewEltSize, OldEltSize);
2948 
2949     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2950                                                Val, OffsetBits);
2951     if (CastTy.isVector()) {
2952       InsertedElt = MIRBuilder.buildInsertVectorElement(
2953         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2954     }
2955 
2956     MIRBuilder.buildBitcast(Dst, InsertedElt);
2957     MI.eraseFromParent();
2958     return Legalized;
2959   }
2960 
2961   return UnableToLegalize;
2962 }
2963 
2964 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2965   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2966   Register DstReg = LoadMI.getDstReg();
2967   Register PtrReg = LoadMI.getPointerReg();
2968   LLT DstTy = MRI.getType(DstReg);
2969   MachineMemOperand &MMO = LoadMI.getMMO();
2970   LLT MemTy = MMO.getMemoryType();
2971   MachineFunction &MF = MIRBuilder.getMF();
2972 
2973   unsigned MemSizeInBits = MemTy.getSizeInBits();
2974   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2975 
2976   if (MemSizeInBits != MemStoreSizeInBits) {
2977     if (MemTy.isVector())
2978       return UnableToLegalize;
2979 
2980     // Promote to a byte-sized load if not loading an integral number of
2981     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2982     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2983     MachineMemOperand *NewMMO =
2984         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2985 
2986     Register LoadReg = DstReg;
2987     LLT LoadTy = DstTy;
2988 
2989     // If this wasn't already an extending load, we need to widen the result
2990     // register to avoid creating a load with a narrower result than the source.
2991     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2992       LoadTy = WideMemTy;
2993       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2994     }
2995 
2996     if (isa<GSExtLoad>(LoadMI)) {
2997       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2998       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2999     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
3000       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A load of the wide type thus automatically gives a zext from the
      // memory type.
3003       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3004     } else {
3005       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3006     }
3007 
3008     if (DstTy != LoadTy)
3009       MIRBuilder.buildTrunc(DstReg, LoadReg);
3010 
3011     LoadMI.eraseFromParent();
3012     return Legalized;
3013   }
3014 
3015   // Big endian lowering not implemented.
3016   if (MIRBuilder.getDataLayout().isBigEndian())
3017     return UnableToLegalize;
3018 
3019   // This load needs splitting into power of 2 sized loads.
3020   //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to the next power-of-2 result type, combine the two results
  // together, and then truncate back down to the non-power-of-2 type.
3025   // E.g. v1 = i24 load =>
3026   // v2 = i32 zextload (2 byte)
3027   // v3 = i32 load (1 byte)
3028   // v4 = i32 shl v3, 16
3029   // v5 = i32 or v4, v2
3030   // v1 = i24 trunc v5
3031   // By doing this we generate the correct truncate which should get
3032   // combined away as an artifact with a matching extend.
3033 
3034   uint64_t LargeSplitSize, SmallSplitSize;
3035 
3036   if (!isPowerOf2_32(MemSizeInBits)) {
    // Split into the largest power-of-2 piece plus the remainder.
3038     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
3039     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3040   } else {
3041     // This is already a power of 2, but we still need to split this in half.
3042     //
3043     // Assume we're being asked to decompose an unaligned load.
3044     // TODO: If this requires multiple splits, handle them all at once.
3045     auto &Ctx = MF.getFunction().getContext();
3046     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3047       return UnableToLegalize;
3048 
3049     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3050   }
3051 
3052   if (MemTy.isVector()) {
3053     // TODO: Handle vector extloads
3054     if (MemTy != DstTy)
3055       return UnableToLegalize;
3056 
3057     // TODO: We can do better than scalarizing the vector and at least split it
3058     // in half.
3059     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3060   }
3061 
3062   MachineMemOperand *LargeMMO =
3063       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3064   MachineMemOperand *SmallMMO =
3065       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3066 
3067   LLT PtrTy = MRI.getType(PtrReg);
3068   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3069   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3070   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3071                                              PtrReg, *LargeMMO);
3072 
3073   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3074                                             LargeSplitSize / 8);
3075   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3076   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3077   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3078                                              SmallPtr, *SmallMMO);
3079 
3080   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3081   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3082 
3083   if (AnyExtTy == DstTy)
3084     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3085   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3086     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3087     MIRBuilder.buildTrunc(DstReg, {Or});
3088   } else {
3089     assert(DstTy.isPointer() && "expected pointer");
3090     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3091 
    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
3094     MIRBuilder.buildIntToPtr(DstReg, Or);
3095   }
3096 
3097   LoadMI.eraseFromParent();
3098   return Legalized;
3099 }
3100 
3101 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3102   // Lower a non-power of 2 store into multiple pow-2 stores.
3103   // E.g. split an i24 store into an i16 store + i8 store.
3104   // We do this by first extending the stored value to the next largest power
3105   // of 2 type, and then using truncating stores to store the components.
  // As with G_LOAD, this generates an extend that can be artifact-combined
  // away instead of leaving behind extracts.
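  //
  // For example, an i24 store of %val splits as (LargeSplitSize = 16):
  //   %ext:_(s32) = G_ANYEXT %val:_(s24)
  //   G_STORE %ext, %ptr           ; 2-byte truncating store
  //   %hi:_(s32) = G_LSHR %ext, 16
  //   %ptr2 = G_PTR_ADD %ptr, 2
  //   G_STORE %hi, %ptr2           ; 1-byte truncating store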
3108   Register SrcReg = StoreMI.getValueReg();
3109   Register PtrReg = StoreMI.getPointerReg();
3110   LLT SrcTy = MRI.getType(SrcReg);
3111   MachineFunction &MF = MIRBuilder.getMF();
3112   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3113   LLT MemTy = MMO.getMemoryType();
3114 
3115   unsigned StoreWidth = MemTy.getSizeInBits();
3116   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3117 
3118   if (StoreWidth != StoreSizeInBits) {
3119     if (SrcTy.isVector())
3120       return UnableToLegalize;
3121 
3122     // Promote to a byte-sized store with upper bits zero if not
3123     // storing an integral number of bytes.  For example, promote
3124     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3125     LLT WideTy = LLT::scalar(StoreSizeInBits);
3126 
3127     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3128       // Avoid creating a store with a narrower source than result.
3129       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3130       SrcTy = WideTy;
3131     }
3132 
3133     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3134 
3135     MachineMemOperand *NewMMO =
3136         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3137     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3138     StoreMI.eraseFromParent();
3139     return Legalized;
3140   }
3141 
3142   if (MemTy.isVector()) {
3143     // TODO: Handle vector trunc stores
3144     if (MemTy != SrcTy)
3145       return UnableToLegalize;
3146 
3147     // TODO: We can do better than scalarizing the vector and at least split it
3148     // in half.
3149     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3150   }
3151 
3152   unsigned MemSizeInBits = MemTy.getSizeInBits();
3153   uint64_t LargeSplitSize, SmallSplitSize;
3154 
3155   if (!isPowerOf2_32(MemSizeInBits)) {
3156     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3157     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3158   } else {
3159     auto &Ctx = MF.getFunction().getContext();
3160     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3161       return UnableToLegalize; // Don't know what we're being asked to do.
3162 
3163     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3164   }
3165 
3166   // Extend to the next pow-2. If this store was itself the result of lowering,
3167   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3168   // that's wider than the stored size.
3169   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3170   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3171 
3172   if (SrcTy.isPointer()) {
3173     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3174     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3175   }
3176 
3177   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3178 
3179   // Obtain the smaller value by shifting away the larger value.
3180   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3181   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3182 
3183   // Generate the PtrAdd and truncating stores.
3184   LLT PtrTy = MRI.getType(PtrReg);
3185   auto OffsetCst = MIRBuilder.buildConstant(
3186     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3187   auto SmallPtr =
3188     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3189 
3190   MachineMemOperand *LargeMMO =
3191     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3192   MachineMemOperand *SmallMMO =
3193     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3194   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3195   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3196   StoreMI.eraseFromParent();
3197   return Legalized;
3198 }
3199 
3200 LegalizerHelper::LegalizeResult
3201 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3202   switch (MI.getOpcode()) {
3203   case TargetOpcode::G_LOAD: {
3204     if (TypeIdx != 0)
3205       return UnableToLegalize;
3206     MachineMemOperand &MMO = **MI.memoperands_begin();
3207 
3208     // Not sure how to interpret a bitcast of an extending load.
3209     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3210       return UnableToLegalize;
3211 
3212     Observer.changingInstr(MI);
3213     bitcastDst(MI, CastTy, 0);
3214     MMO.setType(CastTy);
3215     Observer.changedInstr(MI);
3216     return Legalized;
3217   }
3218   case TargetOpcode::G_STORE: {
3219     if (TypeIdx != 0)
3220       return UnableToLegalize;
3221 
3222     MachineMemOperand &MMO = **MI.memoperands_begin();
3223 
3224     // Not sure how to interpret a bitcast of a truncating store.
3225     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3226       return UnableToLegalize;
3227 
3228     Observer.changingInstr(MI);
3229     bitcastSrc(MI, CastTy, 0);
3230     MMO.setType(CastTy);
3231     Observer.changedInstr(MI);
3232     return Legalized;
3233   }
3234   case TargetOpcode::G_SELECT: {
3235     if (TypeIdx != 0)
3236       return UnableToLegalize;
3237 
3238     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3239       LLVM_DEBUG(
3240           dbgs() << "bitcast action not implemented for vector select\n");
3241       return UnableToLegalize;
3242     }
3243 
3244     Observer.changingInstr(MI);
3245     bitcastSrc(MI, CastTy, 2);
3246     bitcastSrc(MI, CastTy, 3);
3247     bitcastDst(MI, CastTy, 0);
3248     Observer.changedInstr(MI);
3249     return Legalized;
3250   }
3251   case TargetOpcode::G_AND:
3252   case TargetOpcode::G_OR:
3253   case TargetOpcode::G_XOR: {
3254     Observer.changingInstr(MI);
3255     bitcastSrc(MI, CastTy, 1);
3256     bitcastSrc(MI, CastTy, 2);
3257     bitcastDst(MI, CastTy, 0);
3258     Observer.changedInstr(MI);
3259     return Legalized;
3260   }
3261   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3262     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3263   case TargetOpcode::G_INSERT_VECTOR_ELT:
3264     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3265   default:
3266     return UnableToLegalize;
3267   }
3268 }
3269 
3270 // Legalize an instruction by changing the opcode in place.
void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
  Observer.changingInstr(MI);
  MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
  Observer.changedInstr(MI);
}
3276 
3277 LegalizerHelper::LegalizeResult
3278 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3279   using namespace TargetOpcode;
3280 
  switch (MI.getOpcode()) {
3282   default:
3283     return UnableToLegalize;
3284   case TargetOpcode::G_BITCAST:
3285     return lowerBitcast(MI);
3286   case TargetOpcode::G_SREM:
3287   case TargetOpcode::G_UREM: {
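    // Expand as: rem = x - (x / y) * y.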
3288     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3289     auto Quot =
3290         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3291                               {MI.getOperand(1), MI.getOperand(2)});
3292 
3293     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3294     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_SADDO:
3299   case TargetOpcode::G_SSUBO:
3300     return lowerSADDO_SSUBO(MI);
3301   case TargetOpcode::G_UMULH:
3302   case TargetOpcode::G_SMULH:
3303     return lowerSMULH_UMULH(MI);
3304   case TargetOpcode::G_SMULO:
3305   case TargetOpcode::G_UMULO: {
3306     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3307     // result.
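    //
    // For the unsigned case, for example:
    //   %res = G_MUL %lhs, %rhs
    //   %hi = G_UMULH %lhs, %rhs
    //   %overflow = G_ICMP ne, %hi, 0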
3308     Register Res = MI.getOperand(0).getReg();
3309     Register Overflow = MI.getOperand(1).getReg();
3310     Register LHS = MI.getOperand(2).getReg();
3311     Register RHS = MI.getOperand(3).getReg();
3312     LLT Ty = MRI.getType(Res);
3313 
3314     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3315                           ? TargetOpcode::G_SMULH
3316                           : TargetOpcode::G_UMULH;
3317 
3318     Observer.changingInstr(MI);
3319     const auto &TII = MIRBuilder.getTII();
3320     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3321     MI.RemoveOperand(1);
3322     Observer.changedInstr(MI);
3323 
3324     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3325     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3326 
3327     // Move insert point forward so we can use the Res register if needed.
3328     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3329 
3330     // For *signed* multiply, overflow is detected by checking:
3331     // (hi != (lo >> bitwidth-1))
3332     if (Opcode == TargetOpcode::G_SMULH) {
3333       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3334       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3335       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3336     } else {
3337       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3338     }
3339     return Legalized;
3340   }
3341   case TargetOpcode::G_FNEG: {
3342     Register Res = MI.getOperand(0).getReg();
3343     LLT Ty = MRI.getType(Res);
3344 
3345     // TODO: Handle vector types once we are able to
3346     // represent them.
3347     if (Ty.isVector())
3348       return UnableToLegalize;
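    // Flip only the sign bit: fneg x -> xor x, signmask.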
3349     auto SignMask =
3350         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3351     Register SubByReg = MI.getOperand(1).getReg();
3352     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3353     MI.eraseFromParent();
3354     return Legalized;
3355   }
3356   case TargetOpcode::G_FSUB: {
3357     Register Res = MI.getOperand(0).getReg();
3358     LLT Ty = MRI.getType(Res);
3359 
3360     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3361     // First, check if G_FNEG is marked as Lower. If so, we may
3362     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3363     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3364       return UnableToLegalize;
3365     Register LHS = MI.getOperand(1).getReg();
3366     Register RHS = MI.getOperand(2).getReg();
3367     Register Neg = MRI.createGenericVirtualRegister(Ty);
3368     MIRBuilder.buildFNeg(Neg, RHS);
3369     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3370     MI.eraseFromParent();
3371     return Legalized;
3372   }
3373   case TargetOpcode::G_FMAD:
3374     return lowerFMad(MI);
3375   case TargetOpcode::G_FFLOOR:
3376     return lowerFFloor(MI);
3377   case TargetOpcode::G_INTRINSIC_ROUND:
3378     return lowerIntrinsicRound(MI);
3379   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3380     // Since round even is the assumed rounding mode for unconstrained FP
3381     // operations, rint and roundeven are the same operation.
3382     changeOpcode(MI, TargetOpcode::G_FRINT);
3383     return Legalized;
3384   }
3385   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3386     Register OldValRes = MI.getOperand(0).getReg();
3387     Register SuccessRes = MI.getOperand(1).getReg();
3388     Register Addr = MI.getOperand(2).getReg();
3389     Register CmpVal = MI.getOperand(3).getReg();
3390     Register NewVal = MI.getOperand(4).getReg();
3391     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3392                                   **MI.memoperands_begin());
3393     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3394     MI.eraseFromParent();
3395     return Legalized;
3396   }
3397   case TargetOpcode::G_LOAD:
3398   case TargetOpcode::G_SEXTLOAD:
3399   case TargetOpcode::G_ZEXTLOAD:
3400     return lowerLoad(cast<GAnyLoad>(MI));
3401   case TargetOpcode::G_STORE:
3402     return lowerStore(cast<GStore>(MI));
3403   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3404   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3405   case TargetOpcode::G_CTLZ:
3406   case TargetOpcode::G_CTTZ:
3407   case TargetOpcode::G_CTPOP:
3408     return lowerBitCount(MI);
3409   case G_UADDO: {
3410     Register Res = MI.getOperand(0).getReg();
3411     Register CarryOut = MI.getOperand(1).getReg();
3412     Register LHS = MI.getOperand(2).getReg();
3413     Register RHS = MI.getOperand(3).getReg();
3414 
3415     MIRBuilder.buildAdd(Res, LHS, RHS);
3416     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3417 
3418     MI.eraseFromParent();
3419     return Legalized;
3420   }
  case G_UADDE: {
    Register Res = MI.getOperand(0).getReg();
    Register CarryOut = MI.getOperand(1).getReg();
    Register LHS = MI.getOperand(2).getReg();
    Register RHS = MI.getOperand(3).getReg();
    Register CarryIn = MI.getOperand(4).getReg();
    const LLT CondTy = MRI.getType(CarryOut);
    LLT Ty = MRI.getType(Res);

    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);

    // A single (Res u< LHS) check misses the case where adding the carry-in
    // wraps the sum back around to LHS (e.g. LHS + ~0 + 1). The first add
    // carries iff TmpRes u< LHS; the carry-in add carries iff it wrapped
    // TmpRes (all ones) back to zero.
    auto Carry1 = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
    auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
    MIRBuilder.buildOr(CarryOut, Carry1, Carry2);

    MI.eraseFromParent();
    return Legalized;
  }
3437   case G_USUBO: {
3438     Register Res = MI.getOperand(0).getReg();
3439     Register BorrowOut = MI.getOperand(1).getReg();
3440     Register LHS = MI.getOperand(2).getReg();
3441     Register RHS = MI.getOperand(3).getReg();
3442 
3443     MIRBuilder.buildSub(Res, LHS, RHS);
3444     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3445 
3446     MI.eraseFromParent();
3447     return Legalized;
3448   }
3449   case G_USUBE: {
3450     Register Res = MI.getOperand(0).getReg();
3451     Register BorrowOut = MI.getOperand(1).getReg();
3452     Register LHS = MI.getOperand(2).getReg();
3453     Register RHS = MI.getOperand(3).getReg();
3454     Register BorrowIn = MI.getOperand(4).getReg();
3455     const LLT CondTy = MRI.getType(BorrowOut);
3456     const LLT Ty = MRI.getType(Res);
3457 
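    // Res = LHS - RHS - BorrowIn. When LHS == RHS, subtracting RHS alone
    // borrows nothing, so the borrow-out is just BorrowIn; otherwise the
    // borrow-out is LHS u< RHS.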
3458     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3459     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3460     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3461 
3462     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3463     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3464     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3465 
3466     MI.eraseFromParent();
3467     return Legalized;
3468   }
3469   case G_UITOFP:
3470     return lowerUITOFP(MI);
3471   case G_SITOFP:
3472     return lowerSITOFP(MI);
3473   case G_FPTOUI:
3474     return lowerFPTOUI(MI);
3475   case G_FPTOSI:
3476     return lowerFPTOSI(MI);
3477   case G_FPTRUNC:
3478     return lowerFPTRUNC(MI);
3479   case G_FPOWI:
3480     return lowerFPOWI(MI);
3481   case G_SMIN:
3482   case G_SMAX:
3483   case G_UMIN:
3484   case G_UMAX:
3485     return lowerMinMax(MI);
3486   case G_FCOPYSIGN:
3487     return lowerFCopySign(MI);
3488   case G_FMINNUM:
3489   case G_FMAXNUM:
3490     return lowerFMinNumMaxNum(MI);
3491   case G_MERGE_VALUES:
3492     return lowerMergeValues(MI);
3493   case G_UNMERGE_VALUES:
3494     return lowerUnmergeValues(MI);
3495   case TargetOpcode::G_SEXT_INREG: {
3496     assert(MI.getOperand(2).isImm() && "Expected immediate");
3497     int64_t SizeInBits = MI.getOperand(2).getImm();
3498 
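    // Expand to a pair of shifts: move the field up against the sign bit,
    // then arithmetic-shift it back down. E.g. sext_inreg of 8 bits on s32
    // becomes (ashr (shl x, 24), 24).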
3499     Register DstReg = MI.getOperand(0).getReg();
3500     Register SrcReg = MI.getOperand(1).getReg();
3501     LLT DstTy = MRI.getType(DstReg);
3502     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3503 
3504     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3505     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3506     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3507     MI.eraseFromParent();
3508     return Legalized;
3509   }
3510   case G_EXTRACT_VECTOR_ELT:
3511   case G_INSERT_VECTOR_ELT:
3512     return lowerExtractInsertVectorElt(MI);
3513   case G_SHUFFLE_VECTOR:
3514     return lowerShuffleVector(MI);
3515   case G_DYN_STACKALLOC:
3516     return lowerDynStackAlloc(MI);
3517   case G_EXTRACT:
3518     return lowerExtract(MI);
3519   case G_INSERT:
3520     return lowerInsert(MI);
3521   case G_BSWAP:
3522     return lowerBswap(MI);
3523   case G_BITREVERSE:
3524     return lowerBitreverse(MI);
3525   case G_READ_REGISTER:
3526   case G_WRITE_REGISTER:
3527     return lowerReadWriteRegister(MI);
3528   case G_UADDSAT:
3529   case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this by requesting custom lowering and calling the
    // implementation functions directly.
3533     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3534     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3535       return lowerAddSubSatToMinMax(MI);
3536     return lowerAddSubSatToAddoSubo(MI);
3537   }
3538   case G_SADDSAT:
3539   case G_SSUBSAT: {
3540     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3541 
3542     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3543     // since it's a shorter expansion. However, we would need to figure out the
3544     // preferred boolean type for the carry out for the query.
3545     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3546       return lowerAddSubSatToMinMax(MI);
3547     return lowerAddSubSatToAddoSubo(MI);
3548   }
3549   case G_SSHLSAT:
3550   case G_USHLSAT:
3551     return lowerShlSat(MI);
3552   case G_ABS:
3553     return lowerAbsToAddXor(MI);
3554   case G_SELECT:
3555     return lowerSelect(MI);
3556   case G_SDIVREM:
3557   case G_UDIVREM:
3558     return lowerDIVREM(MI);
3559   case G_FSHL:
3560   case G_FSHR:
3561     return lowerFunnelShift(MI);
3562   case G_ROTL:
3563   case G_ROTR:
3564     return lowerRotate(MI);
3565   case G_MEMSET:
3566   case G_MEMCPY:
3567   case G_MEMMOVE:
3568     return lowerMemCpyFamily(MI);
3569   case G_MEMCPY_INLINE:
3570     return lowerMemcpyInline(MI);
3571   GISEL_VECREDUCE_CASES_NONSEQ
3572     return lowerVectorReduction(MI);
3573   }
3574 }
3575 
3576 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3577                                                   Align MinAlign) const {
3578   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3579   // datalayout for the preferred alignment. Also there should be a target hook
3580   // for this to allow targets to reduce the alignment and ignore the
3581   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3582   // the type.
3583   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3584 }
3585 
3586 MachineInstrBuilder
3587 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3588                                       MachinePointerInfo &PtrInfo) {
3589   MachineFunction &MF = MIRBuilder.getMF();
3590   const DataLayout &DL = MIRBuilder.getDataLayout();
3591   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3592 
3593   unsigned AddrSpace = DL.getAllocaAddrSpace();
3594   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3595 
3596   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3597   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3598 }
3599 
3600 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3601                                         LLT VecTy) {
3602   int64_t IdxVal;
3603   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3604     return IdxReg;
3605 
3606   LLT IdxTy = B.getMRI()->getType(IdxReg);
3607   unsigned NElts = VecTy.getNumElements();
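  // For a power-of-2 element count, masking with NElts - 1 keeps the index
  // in range cheaply; otherwise clamp with an unsigned min against the last
  // valid index.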
3608   if (isPowerOf2_32(NElts)) {
3609     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3610     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3611   }
3612 
3613   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3614       .getReg(0);
3615 }
3616 
3617 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3618                                                   Register Index) {
3619   LLT EltTy = VecTy.getElementType();
3620 
3621   // Calculate the element offset and add it to the pointer.
3622   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3623   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3624          "Converting bits to bytes lost precision");
3625 
3626   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3627 
3628   LLT IdxTy = MRI.getType(Index);
3629   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3630                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3631 
3632   LLT PtrTy = MRI.getType(VecPtr);
3633   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3634 }
3635 
3636 #ifndef NDEBUG
/// Check that all vector operands have the same number of elements. Other
/// operands should be listed in \p NonVecOpIndices.
3639 static bool hasSameNumEltsOnAllVectorOperands(
3640     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3641     std::initializer_list<unsigned> NonVecOpIndices) {
3642   if (MI.getNumMemOperands() != 0)
3643     return false;
3644 
3645   LLT VecTy = MRI.getType(MI.getReg(0));
3646   if (!VecTy.isVector())
3647     return false;
3648   unsigned NumElts = VecTy.getNumElements();
3649 
3650   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3651     MachineOperand &Op = MI.getOperand(OpIdx);
3652     if (!Op.isReg()) {
3653       if (!is_contained(NonVecOpIndices, OpIdx))
3654         return false;
3655       continue;
3656     }
3657 
3658     LLT Ty = MRI.getType(Op.getReg());
3659     if (!Ty.isVector()) {
3660       if (!is_contained(NonVecOpIndices, OpIdx))
3661         return false;
3662       continue;
3663     }
3664 
3665     if (Ty.getNumElements() != NumElts)
3666       return false;
3667   }
3668 
3669   return true;
3670 }
3671 #endif
3672 
/// Fill \p DstOps with DstOps that, combined, cover the same number of
/// elements as \p Ty. Each DstOp is a scalar when \p NumElts is 1, and a
/// vector with \p NumElts elements otherwise. When Ty.getNumElements() is not
/// a multiple of \p NumElts, the last DstOp (leftover) has fewer than
/// \p NumElts elements.
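///
/// For example, Ty = <7 x s32> with NumElts = 4 produces
/// { <4 x s32>, <3 x s32> }.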
3677 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3678                        unsigned NumElts) {
3679   LLT LeftoverTy;
3680   assert(Ty.isVector() && "Expected vector type");
3681   LLT EltTy = Ty.getElementType();
3682   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3683   int NumParts, NumLeftover;
3684   std::tie(NumParts, NumLeftover) =
3685       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3686 
3687   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3688   for (int i = 0; i < NumParts; ++i) {
3689     DstOps.push_back(NarrowTy);
3690   }
3691 
3692   if (LeftoverTy.isValid()) {
3693     assert(NumLeftover == 1 && "expected exactly one leftover");
3694     DstOps.push_back(LeftoverTy);
3695   }
3696 }
3697 
/// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N
/// SrcOps made from \p Op, depending on the operand type.
3700 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3701                            MachineOperand &Op) {
3702   for (unsigned i = 0; i < N; ++i) {
3703     if (Op.isReg())
3704       Ops.push_back(Op.getReg());
3705     else if (Op.isImm())
3706       Ops.push_back(Op.getImm());
3707     else if (Op.isPredicate())
3708       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3709     else
3710       llvm_unreachable("Unsupported type");
3711   }
3712 }
3713 
3714 // Handle splitting vector operations which need to have the same number of
3715 // elements in each type index, but each type index may have a different element
3716 // type.
3717 //
3718 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3719 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3720 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3721 //
// Also handles some irregular breakdown cases, e.g.
//       <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3724 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3725 //             s64 = G_SHL s64, s32
3726 LegalizerHelper::LegalizeResult
3727 LegalizerHelper::fewerElementsVectorMultiEltType(
3728     GenericMachineInstr &MI, unsigned NumElts,
3729     std::initializer_list<unsigned> NonVecOpIndices) {
3730   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3731          "Non-compatible opcode or not specified non-vector operands");
3732   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3733 
3734   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3735   unsigned NumDefs = MI.getNumDefs();
3736 
3737   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3738   // Build instructions with DstOps to use instruction found by CSE directly.
3739   // CSE copies found instruction into given vreg when building with vreg dest.
3740   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3741   // Output registers will be taken from created instructions.
3742   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3743   for (unsigned i = 0; i < NumDefs; ++i) {
3744     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3745   }
3746 
3747   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3748   // Operands listed in NonVecOpIndices will be used as is without splitting;
3749   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3750   // scalar condition (op 1), immediate in sext_inreg (op 2).
3751   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3752   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3753        ++UseIdx, ++UseNo) {
3754     if (is_contained(NonVecOpIndices, UseIdx)) {
3755       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3756                      MI.getOperand(UseIdx));
3757     } else {
3758       SmallVector<Register, 8> SplitPieces;
3759       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3760       for (auto Reg : SplitPieces)
3761         InputOpsPieces[UseNo].push_back(Reg);
3762     }
3763   }
3764 
3765   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3766 
3767   // Take i-th piece of each input operand split and build sub-vector/scalar
3768   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3769   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3770     SmallVector<DstOp, 2> Defs;
3771     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3772       Defs.push_back(OutputOpsPieces[DstNo][i]);
3773 
3774     SmallVector<SrcOp, 3> Uses;
3775     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3776       Uses.push_back(InputOpsPieces[InputNo][i]);
3777 
3778     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3779     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3780       OutputRegs[DstNo].push_back(I.getReg(DstNo));
3781   }
3782 
3783   // Merge small outputs into MI's output for each def operand.
3784   if (NumLeftovers) {
3785     for (unsigned i = 0; i < NumDefs; ++i)
3786       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3787   } else {
3788     for (unsigned i = 0; i < NumDefs; ++i)
3789       MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]);
3790   }
3791 
3792   MI.eraseFromParent();
3793   return Legalized;
3794 }
3795 
3796 LegalizerHelper::LegalizeResult
3797 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3798                                         unsigned NumElts) {
3799   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3800 
3801   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3802   unsigned NumDefs = MI.getNumDefs();
3803 
3804   SmallVector<DstOp, 8> OutputOpsPieces;
3805   SmallVector<Register, 8> OutputRegs;
3806   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3807 
  // Instructions that perform the register split will be inserted in the basic
  // block where the register is defined (the basic block is in the next
  // operand).
3810   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3811   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3812        UseIdx += 2, ++UseNo) {
3813     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3814     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3815     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3816   }
3817 
3818   // Build PHIs with fewer elements.
3819   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3820   MIRBuilder.setInsertPt(*MI.getParent(), MI);
3821   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3822     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3823     Phi.addDef(
3824         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3825     OutputRegs.push_back(Phi.getReg(0));
3826 
3827     for (unsigned j = 0; j < NumInputs / 2; ++j) {
3828       Phi.addUse(InputOpsPieces[j][i]);
3829       Phi.add(MI.getOperand(1 + j * 2 + 1));
3830     }
3831   }
3832 
3833   // Merge small outputs into MI's def.
3834   if (NumLeftovers) {
3835     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3836   } else {
3837     MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
3838   }
3839 
3840   MI.eraseFromParent();
3841   return Legalized;
3842 }
3843 
3844 LegalizerHelper::LegalizeResult
3845 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3846                                                   unsigned TypeIdx,
3847                                                   LLT NarrowTy) {
3848   const int NumDst = MI.getNumOperands() - 1;
3849   const Register SrcReg = MI.getOperand(NumDst).getReg();
3850   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3851   LLT SrcTy = MRI.getType(SrcReg);
3852 
3853   if (TypeIdx != 1 || NarrowTy == DstTy)
3854     return UnableToLegalize;
3855 
  // Requires compatible types. Otherwise SrcReg should have been defined by a
  // merge-like instruction that would get artifact-combined. Most likely the
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
3860   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3861   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3862 
3863   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3864       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3865     return UnableToLegalize;
3866 
  // This is most likely DstTy (smaller than register size) packed in SrcTy
  // (larger than register size), and since the unmerge was not combined it
  // will be lowered to bit sequence extracts from a register. Unpack SrcTy to
  // NarrowTy (register size) pieces first, then unpack each NarrowTy piece to
  // DstTy.
3871 
3872   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3873   //
3874   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3875   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3876   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3877   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3878   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3879   const int PartsPerUnmerge = NumDst / NumUnmerge;
3880 
3881   for (int I = 0; I != NumUnmerge; ++I) {
3882     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3883 
3884     for (int J = 0; J != PartsPerUnmerge; ++J)
3885       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3886     MIB.addUse(Unmerge.getReg(I));
3887   }
3888 
3889   MI.eraseFromParent();
3890   return Legalized;
3891 }
3892 
3893 LegalizerHelper::LegalizeResult
3894 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3895                                           LLT NarrowTy) {
3896   Register DstReg = MI.getOperand(0).getReg();
3897   LLT DstTy = MRI.getType(DstReg);
3898   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
  // Requires compatible types. Otherwise the user of DstReg did not perform an
  // unmerge that should have been artifact-combined. Most likely the
  // instruction that uses DstReg has to do more/fewer elements legalization
  // compatible with NarrowTy.
3902   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3903   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3904   if (NarrowTy == SrcTy)
3905     return UnableToLegalize;
3906 
  // This attempts to lower part of an LCMTy merge/unmerge sequence; it is
  // intended for old MIR tests. Since the changes to more/fewer elements
  // legalization, it should no longer be possible to generate MIR like this
  // when starting from LLVM IR, because the LCMTy approach was replaced with
  // merge/unmerge to vector elements.
3911   if (TypeIdx == 1) {
3912     assert(SrcTy.isVector() && "Expected vector types");
3913     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3914     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3915         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3916       return UnableToLegalize;
3917     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3918     //
3919     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3920     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3921     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3922     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3923     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3924     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3925 
3926     SmallVector<Register, 8> Elts;
3927     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3928     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3929       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3930       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3931         Elts.push_back(Unmerge.getReg(j));
3932     }
3933 
3934     SmallVector<Register, 8> NarrowTyElts;
3935     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3936     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3937     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3938          ++i, Offset += NumNarrowTyElts) {
3939       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3940       NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
3941     }
3942 
3943     MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3944     MI.eraseFromParent();
3945     return Legalized;
3946   }
3947 
3948   assert(TypeIdx == 0 && "Bad type index");
3949   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3950       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3951     return UnableToLegalize;
3952 
  // This is most likely SrcTy (smaller than register size) packed in DstTy
  // (larger than register size), and since the merge was not combined it will
  // be lowered to bit sequence packing into a register. Merge SrcTy to
  // NarrowTy (register size) pieces first, then merge each NarrowTy piece to
  // DstTy.
3957 
3958   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3959   //
3960   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3961   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3962   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3963   SmallVector<Register, 8> NarrowTyElts;
3964   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3965   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3966   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3967   for (unsigned i = 0; i < NumParts; ++i) {
3968     SmallVector<Register, 8> Sources;
3969     for (unsigned j = 0; j < NumElts; ++j)
3970       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3971     NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
3972   }
3973 
3974   MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3975   MI.eraseFromParent();
3976   return Legalized;
3977 }
3978 
3979 LegalizerHelper::LegalizeResult
3980 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3981                                                            unsigned TypeIdx,
3982                                                            LLT NarrowVecTy) {
3983   Register DstReg = MI.getOperand(0).getReg();
3984   Register SrcVec = MI.getOperand(1).getReg();
3985   Register InsertVal;
3986   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3987 
3988   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3989   if (IsInsert)
3990     InsertVal = MI.getOperand(2).getReg();
3991 
3992   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3993 
3994   // TODO: Handle total scalarization case.
3995   if (!NarrowVecTy.isVector())
3996     return UnableToLegalize;
3997 
3998   LLT VecTy = MRI.getType(SrcVec);
3999 
4000   // If the index is a constant, we can really break this down as you would
4001   // expect, and index into the target size pieces.
4002   int64_t IdxVal;
4003   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4004   if (MaybeCst) {
4005     IdxVal = MaybeCst->Value.getSExtValue();
4006     // Avoid out of bounds indexing the pieces.
4007     if (IdxVal >= VecTy.getNumElements()) {
4008       MIRBuilder.buildUndef(DstReg);
4009       MI.eraseFromParent();
4010       return Legalized;
4011     }
4012 
4013     SmallVector<Register, 8> VecParts;
4014     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4015 
4016     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4017     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4018                                     TargetOpcode::G_ANYEXT);
4019 
4020     unsigned NewNumElts = NarrowVecTy.getNumElements();
4021 
4022     LLT IdxTy = MRI.getType(Idx);
4023     int64_t PartIdx = IdxVal / NewNumElts;
4024     auto NewIdx =
4025         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4026 
4027     if (IsInsert) {
4028       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4029 
4030       // Use the adjusted index to insert into one of the subvectors.
4031       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4032           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4033       VecParts[PartIdx] = InsertPart.getReg(0);
4034 
4035       // Recombine the inserted subvector with the others to reform the result
4036       // vector.
4037       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4038     } else {
4039       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4040     }
4041 
4042     MI.eraseFromParent();
4043     return Legalized;
4044   }
4045 
4046   // With a variable index, we can't perform the operation in a smaller type, so
4047   // we're forced to expand this.
4048   //
4049   // TODO: We could emit a chain of compare/select to figure out which piece to
4050   // index.
4051   return lowerExtractInsertVectorElt(MI);
4052 }
4053 
4054 LegalizerHelper::LegalizeResult
4055 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4056                                       LLT NarrowTy) {
4057   // FIXME: Don't know how to handle secondary types yet.
4058   if (TypeIdx != 0)
4059     return UnableToLegalize;
4060 
4061   // This implementation doesn't work for atomics. Give up instead of doing
4062   // something invalid.
4063   if (LdStMI.isAtomic())
4064     return UnableToLegalize;
4065 
4066   bool IsLoad = isa<GLoad>(LdStMI);
4067   Register ValReg = LdStMI.getReg(0);
4068   Register AddrReg = LdStMI.getPointerReg();
4069   LLT ValTy = MRI.getType(ValReg);
4070 
4071   // FIXME: Do we need a distinct NarrowMemory legalize action?
4072   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4073     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4074     return UnableToLegalize;
4075   }
4076 
4077   int NumParts = -1;
4078   int NumLeftover = -1;
4079   LLT LeftoverTy;
4080   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4081   if (IsLoad) {
4082     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4083   } else {
4084     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4085                      NarrowLeftoverRegs)) {
4086       NumParts = NarrowRegs.size();
4087       NumLeftover = NarrowLeftoverRegs.size();
4088     }
4089   }
4090 
4091   if (NumParts == -1)
4092     return UnableToLegalize;
4093 
4094   LLT PtrTy = MRI.getType(AddrReg);
4095   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4096 
4097   unsigned TotalSize = ValTy.getSizeInBits();
4098 
  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
4103   bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4104   auto MMO = LdStMI.getMMO();
4105   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4106                              unsigned NumParts, unsigned Offset) -> unsigned {
4107     MachineFunction &MF = MIRBuilder.getMF();
4108     unsigned PartSize = PartTy.getSizeInBits();
4109     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4110          ++Idx) {
4111       unsigned ByteOffset = Offset / 8;
4112       Register NewAddrReg;
4113 
4114       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4115 
4116       MachineMemOperand *NewMMO =
4117           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4118 
4119       if (IsLoad) {
4120         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4121         ValRegs.push_back(Dst);
4122         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4123       } else {
4124         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4125       }
4126       Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4127     }
4128 
4129     return Offset;
4130   };
4131 
4132   unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4133   unsigned HandledOffset =
4134       splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4135 
4136   // Handle the rest of the register if this isn't an even type breakdown.
4137   if (LeftoverTy.isValid())
4138     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4139 
4140   if (IsLoad) {
4141     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4142                 LeftoverTy, NarrowLeftoverRegs);
4143   }
4144 
4145   LdStMI.eraseFromParent();
4146   return Legalized;
4147 }
4148 
4149 LegalizerHelper::LegalizeResult
4150 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4151                                      LLT NarrowTy) {
4152   using namespace TargetOpcode;
4153   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4154   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4155 
4156   switch (MI.getOpcode()) {
4157   case G_IMPLICIT_DEF:
4158   case G_TRUNC:
4159   case G_AND:
4160   case G_OR:
4161   case G_XOR:
4162   case G_ADD:
4163   case G_SUB:
4164   case G_MUL:
4165   case G_PTR_ADD:
4166   case G_SMULH:
4167   case G_UMULH:
4168   case G_FADD:
4169   case G_FMUL:
4170   case G_FSUB:
4171   case G_FNEG:
4172   case G_FABS:
4173   case G_FCANONICALIZE:
4174   case G_FDIV:
4175   case G_FREM:
4176   case G_FMA:
4177   case G_FMAD:
4178   case G_FPOW:
4179   case G_FEXP:
4180   case G_FEXP2:
4181   case G_FLOG:
4182   case G_FLOG2:
4183   case G_FLOG10:
4184   case G_FNEARBYINT:
4185   case G_FCEIL:
4186   case G_FFLOOR:
4187   case G_FRINT:
4188   case G_INTRINSIC_ROUND:
4189   case G_INTRINSIC_ROUNDEVEN:
4190   case G_INTRINSIC_TRUNC:
4191   case G_FCOS:
4192   case G_FSIN:
4193   case G_FSQRT:
4194   case G_BSWAP:
4195   case G_BITREVERSE:
4196   case G_SDIV:
4197   case G_UDIV:
4198   case G_SREM:
4199   case G_UREM:
4200   case G_SDIVREM:
4201   case G_UDIVREM:
4202   case G_SMIN:
4203   case G_SMAX:
4204   case G_UMIN:
4205   case G_UMAX:
4206   case G_ABS:
4207   case G_FMINNUM:
4208   case G_FMAXNUM:
4209   case G_FMINNUM_IEEE:
4210   case G_FMAXNUM_IEEE:
4211   case G_FMINIMUM:
4212   case G_FMAXIMUM:
4213   case G_FSHL:
4214   case G_FSHR:
4215   case G_ROTL:
4216   case G_ROTR:
4217   case G_FREEZE:
4218   case G_SADDSAT:
4219   case G_SSUBSAT:
4220   case G_UADDSAT:
4221   case G_USUBSAT:
4222   case G_UMULO:
4223   case G_SMULO:
4224   case G_SHL:
4225   case G_LSHR:
4226   case G_ASHR:
4227   case G_SSHLSAT:
4228   case G_USHLSAT:
4229   case G_CTLZ:
4230   case G_CTLZ_ZERO_UNDEF:
4231   case G_CTTZ:
4232   case G_CTTZ_ZERO_UNDEF:
4233   case G_CTPOP:
4234   case G_FCOPYSIGN:
4235   case G_ZEXT:
4236   case G_SEXT:
4237   case G_ANYEXT:
4238   case G_FPEXT:
4239   case G_FPTRUNC:
4240   case G_SITOFP:
4241   case G_UITOFP:
4242   case G_FPTOSI:
4243   case G_FPTOUI:
4244   case G_INTTOPTR:
4245   case G_PTRTOINT:
4246   case G_ADDRSPACE_CAST:
4247     return fewerElementsVectorMultiEltType(GMI, NumElts);
4248   case G_ICMP:
4249   case G_FCMP:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
4251   case G_SELECT:
4252     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4253       return fewerElementsVectorMultiEltType(GMI, NumElts);
4254     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4255   case G_PHI:
4256     return fewerElementsVectorPhi(GMI, NumElts);
4257   case G_UNMERGE_VALUES:
4258     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4259   case G_BUILD_VECTOR:
4260     assert(TypeIdx == 0 && "not a vector type index");
4261     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4262   case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably already works as expected.
4264       return UnableToLegalize;
4265     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4266   case G_EXTRACT_VECTOR_ELT:
4267   case G_INSERT_VECTOR_ELT:
4268     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4269   case G_LOAD:
4270   case G_STORE:
4271     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4272   case G_SEXT_INREG:
4273     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4274   GISEL_VECREDUCE_CASES_NONSEQ
4275     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4276   case G_SHUFFLE_VECTOR:
4277     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4278   default:
4279     return UnableToLegalize;
4280   }
4281 }
4282 
4283 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
4285   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4286   if (TypeIdx != 0)
4287     return UnableToLegalize;
4288 
4289   Register DstReg = MI.getOperand(0).getReg();
4290   Register Src1Reg = MI.getOperand(1).getReg();
4291   Register Src2Reg = MI.getOperand(2).getReg();
4292   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4293   LLT DstTy = MRI.getType(DstReg);
4294   LLT Src1Ty = MRI.getType(Src1Reg);
4295   LLT Src2Ty = MRI.getType(Src2Reg);
4296   // The shuffle should be canonicalized by now.
4297   if (DstTy != Src1Ty)
4298     return UnableToLegalize;
4299   if (DstTy != Src2Ty)
4300     return UnableToLegalize;
4301 
4302   if (!isPowerOf2_32(DstTy.getNumElements()))
4303     return UnableToLegalize;
4304 
  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to split it further.
4307   NarrowTy =
4308       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4309   unsigned NewElts = NarrowTy.getNumElements();
4310 
4311   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4312   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4313   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4314   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4315                         SplitSrc2Regs[1]};
4316 
4317   Register Hi, Lo;
4318 
4319   // If Lo or Hi uses elements from at most two of the four input vectors, then
4320   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4321   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
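  //
  // E.g. (sketch) splitting <4 x s32> %dst = G_SHUFFLE_VECTOR %a, %b,
  // shufflemask(0,4,1,5) with NewElts = 2: the Lo half only reads the pieces
  // a_lo and b_lo (of the four half-vectors produced by extractParts above),
  // so it becomes G_SHUFFLE_VECTOR %a_lo, %b_lo, shufflemask(0,2), and the Hi
  // half becomes shufflemask(1,3) of the same two inputs.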
4322   SmallVector<int, 16> Ops;
4323   for (unsigned High = 0; High < 2; ++High) {
4324     Register &Output = High ? Hi : Lo;
4325 
4326     // Build a shuffle mask for the output, discovering on the fly which
4327     // input vectors to use as shuffle operands (recorded in InputUsed).
4328     // If building a suitable shuffle vector proves too hard, then bail
4329     // out with useBuildVector set.
4330     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4331     unsigned FirstMaskIdx = High * NewElts;
4332     bool UseBuildVector = false;
4333     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4334       // The mask element.  This indexes into the input.
4335       int Idx = Mask[FirstMaskIdx + MaskOffset];
4336 
4337       // The input vector this mask element indexes into.
4338       unsigned Input = (unsigned)Idx / NewElts;
4339 
4340       if (Input >= array_lengthof(Inputs)) {
4341         // The mask element does not index into any input vector.
4342         Ops.push_back(-1);
4343         continue;
4344       }
4345 
4346       // Turn the index into an offset from the start of the input vector.
4347       Idx -= Input * NewElts;
4348 
4349       // Find or create a shuffle vector operand to hold this input.
4350       unsigned OpNo;
4351       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4352         if (InputUsed[OpNo] == Input) {
4353           // This input vector is already an operand.
4354           break;
4355         } else if (InputUsed[OpNo] == -1U) {
4356           // Create a new operand for this input vector.
4357           InputUsed[OpNo] = Input;
4358           break;
4359         }
4360       }
4361 
4362       if (OpNo >= array_lengthof(InputUsed)) {
4363         // More than two input vectors used!  Give up on trying to create a
4364         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4365         UseBuildVector = true;
4366         break;
4367       }
4368 
4369       // Add the mask index for the new shuffle vector.
4370       Ops.push_back(Idx + OpNo * NewElts);
4371     }
4372 
4373     if (UseBuildVector) {
4374       LLT EltTy = NarrowTy.getElementType();
4375       SmallVector<Register, 16> SVOps;
4376 
4377       // Extract the input elements by hand.
4378       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4379         // The mask element.  This indexes into the input.
4380         int Idx = Mask[FirstMaskIdx + MaskOffset];
4381 
4382         // The input vector this mask element indexes into.
4383         unsigned Input = (unsigned)Idx / NewElts;
4384 
4385         if (Input >= array_lengthof(Inputs)) {
4386           // The mask element is "undef" or indexes off the end of the input.
4387           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4388           continue;
4389         }
4390 
4391         // Turn the index into an offset from the start of the input vector.
4392         Idx -= Input * NewElts;
4393 
4394         // Extract the vector element by hand.
4395         SVOps.push_back(MIRBuilder
4396                             .buildExtractVectorElement(
4397                                 EltTy, Inputs[Input],
4398                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4399                             .getReg(0));
4400       }
4401 
4402       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4403       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4404     } else if (InputUsed[0] == -1U) {
4405       // No input vectors were used! The result is undefined.
4406       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4407     } else {
4408       Register Op0 = Inputs[InputUsed[0]];
4409       // If only one input was used, use an undefined vector for the other.
4410       Register Op1 = InputUsed[1] == -1U
4411                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4412                          : Inputs[InputUsed[1]];
4413       // At least one input vector was used. Create a new shuffle vector.
4414       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4415     }
4416 
4417     Ops.clear();
4418   }
4419 
4420   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4421   MI.eraseFromParent();
4422   return Legalized;
4423 }
4424 
4425 static unsigned getScalarOpcForReduction(unsigned Opc) {
4426   unsigned ScalarOpc;
4427   switch (Opc) {
4428   case TargetOpcode::G_VECREDUCE_FADD:
4429     ScalarOpc = TargetOpcode::G_FADD;
4430     break;
4431   case TargetOpcode::G_VECREDUCE_FMUL:
4432     ScalarOpc = TargetOpcode::G_FMUL;
4433     break;
4434   case TargetOpcode::G_VECREDUCE_FMAX:
4435     ScalarOpc = TargetOpcode::G_FMAXNUM;
4436     break;
4437   case TargetOpcode::G_VECREDUCE_FMIN:
4438     ScalarOpc = TargetOpcode::G_FMINNUM;
4439     break;
4440   case TargetOpcode::G_VECREDUCE_ADD:
4441     ScalarOpc = TargetOpcode::G_ADD;
4442     break;
4443   case TargetOpcode::G_VECREDUCE_MUL:
4444     ScalarOpc = TargetOpcode::G_MUL;
4445     break;
4446   case TargetOpcode::G_VECREDUCE_AND:
4447     ScalarOpc = TargetOpcode::G_AND;
4448     break;
4449   case TargetOpcode::G_VECREDUCE_OR:
4450     ScalarOpc = TargetOpcode::G_OR;
4451     break;
4452   case TargetOpcode::G_VECREDUCE_XOR:
4453     ScalarOpc = TargetOpcode::G_XOR;
4454     break;
4455   case TargetOpcode::G_VECREDUCE_SMAX:
4456     ScalarOpc = TargetOpcode::G_SMAX;
4457     break;
4458   case TargetOpcode::G_VECREDUCE_SMIN:
4459     ScalarOpc = TargetOpcode::G_SMIN;
4460     break;
4461   case TargetOpcode::G_VECREDUCE_UMAX:
4462     ScalarOpc = TargetOpcode::G_UMAX;
4463     break;
4464   case TargetOpcode::G_VECREDUCE_UMIN:
4465     ScalarOpc = TargetOpcode::G_UMIN;
4466     break;
4467   default:
4468     llvm_unreachable("Unhandled reduction");
4469   }
4470   return ScalarOpc;
4471 }
4472 
4473 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
    MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
4475   unsigned Opc = MI.getOpcode();
4476   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4477          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4478          "Sequential reductions not expected");
4479 
4480   if (TypeIdx != 1)
4481     return UnableToLegalize;
4482 
4483   // The semantics of the normal non-sequential reductions allow us to freely
4484   // re-associate the operation.
4485   Register SrcReg = MI.getOperand(1).getReg();
4486   LLT SrcTy = MRI.getType(SrcReg);
4487   Register DstReg = MI.getOperand(0).getReg();
4488   LLT DstTy = MRI.getType(DstReg);
4489 
4490   if (NarrowTy.isVector() &&
4491       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4492     return UnableToLegalize;
4493 
4494   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4495   SmallVector<Register> SplitSrcs;
4496   // If NarrowTy is a scalar then we're being asked to scalarize.
4497   const unsigned NumParts =
4498       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4499                           : SrcTy.getNumElements();
4500 
4501   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4502   if (NarrowTy.isScalar()) {
4503     if (DstTy != NarrowTy)
4504       return UnableToLegalize; // FIXME: handle implicit extensions.
4505 
4506     if (isPowerOf2_32(NumParts)) {
4507       // Generate a tree of scalar operations to reduce the critical path.
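      // E.g. with four pieces this builds (p0 op p1) op (p2 op p3): depth 2
      // instead of the depth-3 sequential chain.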
4508       SmallVector<Register> PartialResults;
4509       unsigned NumPartsLeft = NumParts;
4510       while (NumPartsLeft > 1) {
4511         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4512           PartialResults.emplace_back(
4513               MIRBuilder
4514                   .buildInstr(ScalarOpc, {NarrowTy},
4515                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4516                   .getReg(0));
4517         }
4518         SplitSrcs = PartialResults;
4519         PartialResults.clear();
4520         NumPartsLeft = SplitSrcs.size();
4521       }
4522       assert(SplitSrcs.size() == 1);
4523       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4524       MI.eraseFromParent();
4525       return Legalized;
4526     }
4527     // If we can't generate a tree, then just do sequential operations.
4528     Register Acc = SplitSrcs[0];
4529     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4530       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4531                 .getReg(0);
4532     MIRBuilder.buildCopy(DstReg, Acc);
4533     MI.eraseFromParent();
4534     return Legalized;
4535   }
4536   SmallVector<Register> PartialReductions;
4537   for (unsigned Part = 0; Part < NumParts; ++Part) {
4538     PartialReductions.push_back(
4539         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4540   }
4541 
  // If the types involved are powers of 2, we can generate intermediate
  // vector ops before generating a final reduction operation.
4545   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4546       isPowerOf2_32(NarrowTy.getNumElements())) {
4547     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4548   }
4549 
4550   Register Acc = PartialReductions[0];
4551   for (unsigned Part = 1; Part < NumParts; ++Part) {
4552     if (Part == NumParts - 1) {
4553       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4554                             {Acc, PartialReductions[Part]});
4555     } else {
4556       Acc = MIRBuilder
4557                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4558                 .getReg(0);
4559     }
4560   }
4561   MI.eraseFromParent();
4562   return Legalized;
4563 }
4564 
4565 LegalizerHelper::LegalizeResult
4566 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4567                                         LLT SrcTy, LLT NarrowTy,
4568                                         unsigned ScalarOpc) {
4569   SmallVector<Register> SplitSrcs;
4570   // Split the sources into NarrowTy size pieces.
4571   extractParts(SrcReg, NarrowTy,
4572                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4573   // We're going to do a tree reduction using vector operations until we have
4574   // one NarrowTy size value left.
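  // E.g. reducing <8 x s32> with NarrowTy <2 x s32>: four pieces are combined
  // pairwise into two, then into one <2 x s32>, which then becomes the source
  // operand of the original reduction instruction.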
4575   while (SplitSrcs.size() > 1) {
4576     SmallVector<Register> PartialRdxs;
4577     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4578       Register LHS = SplitSrcs[Idx];
4579       Register RHS = SplitSrcs[Idx + 1];
4580       // Create the intermediate vector op.
4581       Register Res =
4582           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4583       PartialRdxs.push_back(Res);
4584     }
4585     SplitSrcs = std::move(PartialRdxs);
4586   }
4587   // Finally generate the requested NarrowTy based reduction.
4588   Observer.changingInstr(MI);
4589   MI.getOperand(1).setReg(SplitSrcs[0]);
4590   Observer.changedInstr(MI);
4591   return Legalized;
4592 }
4593 
4594 LegalizerHelper::LegalizeResult
4595 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4596                                              const LLT HalfTy, const LLT AmtTy) {
4597 
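  // Decompose a shift of a 2*N-bit value into shifts and ors of its N-bit
  // halves. E.g. (sketch) G_SHL of an s64 by 8 with HalfTy s32 produces:
  //   Lo = InL << 8
  //   Hi = (InH << 8) | (InL >> 24)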
4598   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4599   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4600   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4601 
4602   if (Amt.isZero()) {
4603     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4604     MI.eraseFromParent();
4605     return Legalized;
4606   }
4607 
4608   LLT NVT = HalfTy;
4609   unsigned NVTBits = HalfTy.getSizeInBits();
4610   unsigned VTBits = 2 * NVTBits;
4611 
4612   SrcOp Lo(Register(0)), Hi(Register(0));
4613   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4614     if (Amt.ugt(VTBits)) {
4615       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4616     } else if (Amt.ugt(NVTBits)) {
4617       Lo = MIRBuilder.buildConstant(NVT, 0);
4618       Hi = MIRBuilder.buildShl(NVT, InL,
4619                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4620     } else if (Amt == NVTBits) {
4621       Lo = MIRBuilder.buildConstant(NVT, 0);
4622       Hi = InL;
4623     } else {
4624       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4625       auto OrLHS =
4626           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4627       auto OrRHS = MIRBuilder.buildLShr(
4628           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4629       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4630     }
4631   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4632     if (Amt.ugt(VTBits)) {
4633       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4634     } else if (Amt.ugt(NVTBits)) {
4635       Lo = MIRBuilder.buildLShr(NVT, InH,
4636                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4637       Hi = MIRBuilder.buildConstant(NVT, 0);
4638     } else if (Amt == NVTBits) {
4639       Lo = InH;
4640       Hi = MIRBuilder.buildConstant(NVT, 0);
4641     } else {
4642       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4643 
4644       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4645       auto OrRHS = MIRBuilder.buildShl(
4646           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4647 
4648       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4649       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4650     }
4651   } else {
4652     if (Amt.ugt(VTBits)) {
4653       Hi = Lo = MIRBuilder.buildAShr(
4654           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4655     } else if (Amt.ugt(NVTBits)) {
4656       Lo = MIRBuilder.buildAShr(NVT, InH,
4657                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4658       Hi = MIRBuilder.buildAShr(NVT, InH,
4659                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4660     } else if (Amt == NVTBits) {
4661       Lo = InH;
4662       Hi = MIRBuilder.buildAShr(NVT, InH,
4663                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4664     } else {
4665       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4666 
4667       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4668       auto OrRHS = MIRBuilder.buildShl(
4669           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4670 
4671       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4672       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4673     }
4674   }
4675 
4676   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4677   MI.eraseFromParent();
4678 
4679   return Legalized;
4680 }
4681 
4682 // TODO: Optimize if constant shift amount.
4683 LegalizerHelper::LegalizeResult
4684 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4685                                    LLT RequestedTy) {
4686   if (TypeIdx == 1) {
4687     Observer.changingInstr(MI);
4688     narrowScalarSrc(MI, RequestedTy, 2);
4689     Observer.changedInstr(MI);
4690     return Legalized;
4691   }
4692 
4693   Register DstReg = MI.getOperand(0).getReg();
4694   LLT DstTy = MRI.getType(DstReg);
4695   if (DstTy.isVector())
4696     return UnableToLegalize;
4697 
4698   Register Amt = MI.getOperand(2).getReg();
4699   LLT ShiftAmtTy = MRI.getType(Amt);
4700   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4701   if (DstEltSize % 2 != 0)
4702     return UnableToLegalize;
4703 
4704   // Ignore the input type. We can only go to exactly half the size of the
4705   // input. If that isn't small enough, the resulting pieces will be further
4706   // legalized.
4707   const unsigned NewBitSize = DstEltSize / 2;
4708   const LLT HalfTy = LLT::scalar(NewBitSize);
4709   const LLT CondTy = LLT::scalar(1);
4710 
4711   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4712     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4713                                        ShiftAmtTy);
4714   }
4715 
4716   // TODO: Expand with known bits.
4717 
4718   // Handle the fully general expansion by an unknown amount.
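  // Each result half selects between a "short" shift (Amt < NewBitSize, where
  // bits cross between the halves) and a "long" shift (Amt >= NewBitSize,
  // where one half is derived entirely from the other). The extra IsZero
  // select keeps the result well-defined when Amt is zero, where the short
  // path's NewBitSize - Amt sub-shift would be out of range.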
4719   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4720 
4721   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4722   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4723   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4724 
4725   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4726   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4727 
4728   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4729   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4730   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4731 
4732   Register ResultRegs[2];
4733   switch (MI.getOpcode()) {
4734   case TargetOpcode::G_SHL: {
4735     // Short: ShAmt < NewBitSize
4736     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4737 
4738     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4739     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4740     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4741 
4742     // Long: ShAmt >= NewBitSize
4743     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4744     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4745 
4746     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4747     auto Hi = MIRBuilder.buildSelect(
4748         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4749 
4750     ResultRegs[0] = Lo.getReg(0);
4751     ResultRegs[1] = Hi.getReg(0);
4752     break;
4753   }
4754   case TargetOpcode::G_LSHR:
4755   case TargetOpcode::G_ASHR: {
4756     // Short: ShAmt < NewBitSize
4757     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4758 
4759     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4760     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4761     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4762 
4763     // Long: ShAmt >= NewBitSize
4764     MachineInstrBuilder HiL;
4765     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4766       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4767     } else {
4768       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4769       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4770     }
4771     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4772                                      {InH, AmtExcess});     // Lo from Hi part.
4773 
4774     auto Lo = MIRBuilder.buildSelect(
4775         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4776 
4777     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4778 
4779     ResultRegs[0] = Lo.getReg(0);
4780     ResultRegs[1] = Hi.getReg(0);
4781     break;
4782   }
4783   default:
4784     llvm_unreachable("not a shift");
4785   }
4786 
4787   MIRBuilder.buildMerge(DstReg, ResultRegs);
4788   MI.eraseFromParent();
4789   return Legalized;
4790 }
4791 
4792 LegalizerHelper::LegalizeResult
4793 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4794                                        LLT MoreTy) {
4795   assert(TypeIdx == 0 && "Expecting only Idx 0");
4796 
4797   Observer.changingInstr(MI);
4798   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4799     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4800     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4801     moreElementsVectorSrc(MI, MoreTy, I);
4802   }
4803 
4804   MachineBasicBlock &MBB = *MI.getParent();
4805   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4806   moreElementsVectorDst(MI, MoreTy, 0);
4807   Observer.changedInstr(MI);
4808   return Legalized;
4809 }
4810 
4811 LegalizerHelper::LegalizeResult
4812 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
4813                                     LLT MoreTy) {
4814   unsigned Opc = MI.getOpcode();
4815   switch (Opc) {
4816   case TargetOpcode::G_IMPLICIT_DEF:
4817   case TargetOpcode::G_LOAD: {
4818     if (TypeIdx != 0)
4819       return UnableToLegalize;
4820     Observer.changingInstr(MI);
4821     moreElementsVectorDst(MI, MoreTy, 0);
4822     Observer.changedInstr(MI);
4823     return Legalized;
4824   }
4825   case TargetOpcode::G_STORE:
4826     if (TypeIdx != 0)
4827       return UnableToLegalize;
4828     Observer.changingInstr(MI);
4829     moreElementsVectorSrc(MI, MoreTy, 0);
4830     Observer.changedInstr(MI);
4831     return Legalized;
4832   case TargetOpcode::G_AND:
4833   case TargetOpcode::G_OR:
4834   case TargetOpcode::G_XOR:
4835   case TargetOpcode::G_ADD:
4836   case TargetOpcode::G_SUB:
4837   case TargetOpcode::G_MUL:
4838   case TargetOpcode::G_FADD:
4839   case TargetOpcode::G_FMUL:
4840   case TargetOpcode::G_UADDSAT:
4841   case TargetOpcode::G_USUBSAT:
4842   case TargetOpcode::G_SADDSAT:
4843   case TargetOpcode::G_SSUBSAT:
4844   case TargetOpcode::G_SMIN:
4845   case TargetOpcode::G_SMAX:
4846   case TargetOpcode::G_UMIN:
4847   case TargetOpcode::G_UMAX:
4848   case TargetOpcode::G_FMINNUM:
4849   case TargetOpcode::G_FMAXNUM:
4850   case TargetOpcode::G_FMINNUM_IEEE:
4851   case TargetOpcode::G_FMAXNUM_IEEE:
4852   case TargetOpcode::G_FMINIMUM:
4853   case TargetOpcode::G_FMAXIMUM: {
4854     Observer.changingInstr(MI);
4855     moreElementsVectorSrc(MI, MoreTy, 1);
4856     moreElementsVectorSrc(MI, MoreTy, 2);
4857     moreElementsVectorDst(MI, MoreTy, 0);
4858     Observer.changedInstr(MI);
4859     return Legalized;
4860   }
4861   case TargetOpcode::G_FMA:
4862   case TargetOpcode::G_FSHR:
4863   case TargetOpcode::G_FSHL: {
4864     Observer.changingInstr(MI);
4865     moreElementsVectorSrc(MI, MoreTy, 1);
4866     moreElementsVectorSrc(MI, MoreTy, 2);
4867     moreElementsVectorSrc(MI, MoreTy, 3);
4868     moreElementsVectorDst(MI, MoreTy, 0);
4869     Observer.changedInstr(MI);
4870     return Legalized;
4871   }
4872   case TargetOpcode::G_EXTRACT:
4873     if (TypeIdx != 1)
4874       return UnableToLegalize;
4875     Observer.changingInstr(MI);
4876     moreElementsVectorSrc(MI, MoreTy, 1);
4877     Observer.changedInstr(MI);
4878     return Legalized;
4879   case TargetOpcode::G_INSERT:
4880   case TargetOpcode::G_FREEZE:
4881   case TargetOpcode::G_FNEG:
4882   case TargetOpcode::G_FABS:
4883   case TargetOpcode::G_BSWAP:
4884   case TargetOpcode::G_FCANONICALIZE:
4885   case TargetOpcode::G_SEXT_INREG:
4886     if (TypeIdx != 0)
4887       return UnableToLegalize;
4888     Observer.changingInstr(MI);
4889     moreElementsVectorSrc(MI, MoreTy, 1);
4890     moreElementsVectorDst(MI, MoreTy, 0);
4891     Observer.changedInstr(MI);
4892     return Legalized;
4893   case TargetOpcode::G_SELECT:
4894     if (TypeIdx != 0)
4895       return UnableToLegalize;
4896     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4897       return UnableToLegalize;
4898 
4899     Observer.changingInstr(MI);
4900     moreElementsVectorSrc(MI, MoreTy, 2);
4901     moreElementsVectorSrc(MI, MoreTy, 3);
4902     moreElementsVectorDst(MI, MoreTy, 0);
4903     Observer.changedInstr(MI);
4904     return Legalized;
4905   case TargetOpcode::G_UNMERGE_VALUES:
4906     return UnableToLegalize;
4907   case TargetOpcode::G_PHI:
4908     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
4909   case TargetOpcode::G_SHUFFLE_VECTOR:
4910     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
4911   case TargetOpcode::G_BUILD_VECTOR: {
4912     SmallVector<SrcOp, 8> Elts;
4913     for (auto Op : MI.uses()) {
4914       Elts.push_back(Op.getReg());
4915     }
4916 
4917     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
4918       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
4919     }
4920 
4921     MIRBuilder.buildDeleteTrailingVectorElements(
4922         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
4923     MI.eraseFromParent();
4924     return Legalized;
4925   }
4926   case TargetOpcode::G_TRUNC: {
4927     Observer.changingInstr(MI);
4928     moreElementsVectorSrc(MI, MoreTy, 1);
4929     moreElementsVectorDst(MI, MoreTy, 0);
4930     Observer.changedInstr(MI);
4931     return Legalized;
4932   }
4933   default:
4934     return UnableToLegalize;
4935   }
4936 }
4937 
4938 LegalizerHelper::LegalizeResult
4939 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned TypeIdx, LLT MoreTy) {
4941   if (TypeIdx != 0)
4942     return UnableToLegalize;
4943 
4944   Register DstReg = MI.getOperand(0).getReg();
4945   Register Src1Reg = MI.getOperand(1).getReg();
4946   Register Src2Reg = MI.getOperand(2).getReg();
4947   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4948   LLT DstTy = MRI.getType(DstReg);
4949   LLT Src1Ty = MRI.getType(Src1Reg);
4950   LLT Src2Ty = MRI.getType(Src2Reg);
4951   unsigned NumElts = DstTy.getNumElements();
4952   unsigned WidenNumElts = MoreTy.getNumElements();
4953 
4954   // Expect a canonicalized shuffle.
4955   if (DstTy != Src1Ty || DstTy != Src2Ty)
4956     return UnableToLegalize;
4957 
4958   moreElementsVectorSrc(MI, MoreTy, 1);
4959   moreElementsVectorSrc(MI, MoreTy, 2);
4960 
4961   // Adjust mask based on new input vector length.
4962   SmallVector<int, 16> NewMask;
4963   for (unsigned I = 0; I != NumElts; ++I) {
4964     int Idx = Mask[I];
4965     if (Idx < static_cast<int>(NumElts))
4966       NewMask.push_back(Idx);
4967     else
4968       NewMask.push_back(Idx - NumElts + WidenNumElts);
4969   }
4970   for (unsigned I = NumElts; I != WidenNumElts; ++I)
4971     NewMask.push_back(-1);
4972   moreElementsVectorDst(MI, MoreTy, 0);
4973   MIRBuilder.setInstrAndDebugLoc(MI);
4974   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4975                                 MI.getOperand(1).getReg(),
4976                                 MI.getOperand(2).getReg(), NewMask);
4977   MI.eraseFromParent();
4978   return Legalized;
4979 }
4980 
4981 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
4982                                         ArrayRef<Register> Src1Regs,
4983                                         ArrayRef<Register> Src2Regs,
4984                                         LLT NarrowTy) {
4985   MachineIRBuilder &B = MIRBuilder;
4986   unsigned SrcParts = Src1Regs.size();
4987   unsigned DstParts = DstRegs.size();
4988 
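  // Schoolbook multiplication on NarrowTy-sized "digits": result part k sums
  // the low halves of the products Src1[k - i] * Src2[i], the high halves of
  // the products that formed part k - 1, and the carries accumulated while
  // forming part k - 1.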
4989   unsigned DstIdx = 0; // Low bits of the result.
4990   Register FactorSum =
4991       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
4992   DstRegs[DstIdx] = FactorSum;
4993 
4994   unsigned CarrySumPrevDstIdx;
4995   SmallVector<Register, 4> Factors;
4996 
4997   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
4998     // Collect low parts of muls for DstIdx.
4999     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5000          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5001       MachineInstrBuilder Mul =
5002           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5003       Factors.push_back(Mul.getReg(0));
5004     }
5005     // Collect high parts of muls from previous DstIdx.
5006     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5007          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5008       MachineInstrBuilder Umulh =
5009           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5010       Factors.push_back(Umulh.getReg(0));
5011     }
5012     // Add CarrySum from additions calculated for previous DstIdx.
5013     if (DstIdx != 1) {
5014       Factors.push_back(CarrySumPrevDstIdx);
5015     }
5016 
5017     Register CarrySum;
5018     // Add all factors and accumulate all carries into CarrySum.
5019     if (DstIdx != DstParts - 1) {
5020       MachineInstrBuilder Uaddo =
5021           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5022       FactorSum = Uaddo.getReg(0);
5023       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5024       for (unsigned i = 2; i < Factors.size(); ++i) {
5025         MachineInstrBuilder Uaddo =
5026             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5027         FactorSum = Uaddo.getReg(0);
5028         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5029         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5030       }
5031     } else {
5032       // Since value for the next index is not calculated, neither is CarrySum.
5033       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5034       for (unsigned i = 2; i < Factors.size(); ++i)
5035         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5036     }
5037 
5038     CarrySumPrevDstIdx = CarrySum;
5039     DstRegs[DstIdx] = FactorSum;
5040     Factors.clear();
5041   }
5042 }
5043 
5044 LegalizerHelper::LegalizeResult
5045 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5046                                     LLT NarrowTy) {
5047   if (TypeIdx != 0)
5048     return UnableToLegalize;
5049 
5050   Register DstReg = MI.getOperand(0).getReg();
5051   LLT DstType = MRI.getType(DstReg);
5052   // FIXME: add support for vector types
5053   if (DstType.isVector())
5054     return UnableToLegalize;
5055 
5056   unsigned Opcode = MI.getOpcode();
5057   unsigned OpO, OpE, OpF;
5058   switch (Opcode) {
5059   case TargetOpcode::G_SADDO:
5060   case TargetOpcode::G_SADDE:
5061   case TargetOpcode::G_UADDO:
5062   case TargetOpcode::G_UADDE:
5063   case TargetOpcode::G_ADD:
5064     OpO = TargetOpcode::G_UADDO;
5065     OpE = TargetOpcode::G_UADDE;
5066     OpF = TargetOpcode::G_UADDE;
5067     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5068       OpF = TargetOpcode::G_SADDE;
5069     break;
5070   case TargetOpcode::G_SSUBO:
5071   case TargetOpcode::G_SSUBE:
5072   case TargetOpcode::G_USUBO:
5073   case TargetOpcode::G_USUBE:
5074   case TargetOpcode::G_SUB:
5075     OpO = TargetOpcode::G_USUBO;
5076     OpE = TargetOpcode::G_USUBE;
5077     OpF = TargetOpcode::G_USUBE;
5078     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5079       OpF = TargetOpcode::G_SSUBE;
5080     break;
5081   default:
5082     llvm_unreachable("Unexpected add/sub opcode!");
5083   }
5084 
5085   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5086   unsigned NumDefs = MI.getNumExplicitDefs();
5087   Register Src1 = MI.getOperand(NumDefs).getReg();
5088   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5089   Register CarryDst, CarryIn;
5090   if (NumDefs == 2)
5091     CarryDst = MI.getOperand(1).getReg();
5092   if (MI.getNumOperands() == NumDefs + 3)
5093     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5094 
5095   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5096   LLT LeftoverTy, DummyTy;
5097   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5098   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5099   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5100 
5101   int NarrowParts = Src1Regs.size();
5102   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5103     Src1Regs.push_back(Src1Left[I]);
5104     Src2Regs.push_back(Src2Left[I]);
5105   }
5106   DstRegs.reserve(Src1Regs.size());
5107 
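  // E.g. (sketch) an s64 G_ADD narrowed to s32 pieces becomes a carry chain:
  //   %lo:_(s32), %c0:_(s1) = G_UADDO %src1_lo, %src2_lo
  //   %hi:_(s32), %c1:_(s1) = G_UADDE %src1_hi, %src2_hi, %c0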
5108   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5109     Register DstReg =
5110         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5111     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5112     // Forward the final carry-out to the destination register
5113     if (i == e - 1 && CarryDst)
5114       CarryOut = CarryDst;
5115 
5116     if (!CarryIn) {
5117       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5118                             {Src1Regs[i], Src2Regs[i]});
5119     } else if (i == e - 1) {
5120       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5121                             {Src1Regs[i], Src2Regs[i], CarryIn});
5122     } else {
5123       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5124                             {Src1Regs[i], Src2Regs[i], CarryIn});
5125     }
5126 
5127     DstRegs.push_back(DstReg);
5128     CarryIn = CarryOut;
5129   }
5130   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5131               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5132               makeArrayRef(DstRegs).drop_front(NarrowParts));
5133 
5134   MI.eraseFromParent();
5135   return Legalized;
5136 }
5137 
5138 LegalizerHelper::LegalizeResult
5139 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5140   Register DstReg = MI.getOperand(0).getReg();
5141   Register Src1 = MI.getOperand(1).getReg();
5142   Register Src2 = MI.getOperand(2).getReg();
5143 
5144   LLT Ty = MRI.getType(DstReg);
5145   if (Ty.isVector())
5146     return UnableToLegalize;
5147 
5148   unsigned Size = Ty.getSizeInBits();
5149   unsigned NarrowSize = NarrowTy.getSizeInBits();
5150   if (Size % NarrowSize != 0)
5151     return UnableToLegalize;
5152 
5153   unsigned NumParts = Size / NarrowSize;
5154   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5155   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5156 
5157   SmallVector<Register, 2> Src1Parts, Src2Parts;
5158   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5159   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5160   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5161   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5162 
  // Take only the high half of the registers if this is a high mul.
5164   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5165   MIRBuilder.buildMerge(DstReg, DstRegs);
5166   MI.eraseFromParent();
5167   return Legalized;
5168 }
5169 
5170 LegalizerHelper::LegalizeResult
5171 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5172                                    LLT NarrowTy) {
5173   if (TypeIdx != 0)
5174     return UnableToLegalize;
5175 
5176   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5177 
5178   Register Src = MI.getOperand(1).getReg();
5179   LLT SrcTy = MRI.getType(Src);
5180 
  // If all finite floats fit into the narrowed integer type, we can just swap
  // out the result type. This is practically only useful for conversions from
  // half to at least 16 bits, so just handle the one case.
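  // (The largest finite half is 65504, which needs 16 bits as an unsigned
  // integer or 17 bits as a signed one.)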
5184   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5185       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5186     return UnableToLegalize;
5187 
5188   Observer.changingInstr(MI);
5189   narrowScalarDst(MI, NarrowTy, 0,
5190                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5191   Observer.changedInstr(MI);
5192   return Legalized;
5193 }
5194 
5195 LegalizerHelper::LegalizeResult
5196 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5197                                      LLT NarrowTy) {
5198   if (TypeIdx != 1)
5199     return UnableToLegalize;
5200 
5201   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5202 
5203   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5204   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5205   // NarrowSize.
5206   if (SizeOp1 % NarrowSize != 0)
5207     return UnableToLegalize;
5208   int NumParts = SizeOp1 / NarrowSize;
5209 
5210   SmallVector<Register, 2> SrcRegs, DstRegs;
5211   SmallVector<uint64_t, 2> Indexes;
5212   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5213 
5214   Register OpReg = MI.getOperand(0).getReg();
5215   uint64_t OpStart = MI.getOperand(2).getImm();
5216   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5217   for (int i = 0; i < NumParts; ++i) {
5218     unsigned SrcStart = i * NarrowSize;
5219 
5220     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5221       // No part of the extract uses this subregister, ignore it.
5222       continue;
5223     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5224       // The entire subregister is extracted, forward the value.
5225       DstRegs.push_back(SrcRegs[i]);
5226       continue;
5227     }
5228 
    // Compute the overlap between this source piece and the extracted range:
    // ExtractOffset is where the overlap begins within the piece, and SegSize
    // is how many bits of the piece are part of the result.
5231     int64_t ExtractOffset;
5232     uint64_t SegSize;
5233     if (OpStart < SrcStart) {
5234       ExtractOffset = 0;
5235       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5236     } else {
5237       ExtractOffset = OpStart - SrcStart;
5238       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5239     }
5240 
5241     Register SegReg = SrcRegs[i];
5242     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5243       // A genuine extract is needed.
5244       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5245       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5246     }
5247 
5248     DstRegs.push_back(SegReg);
5249   }
5250 
5251   Register DstReg = MI.getOperand(0).getReg();
5252   if (MRI.getType(DstReg).isVector())
5253     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5254   else if (DstRegs.size() > 1)
5255     MIRBuilder.buildMerge(DstReg, DstRegs);
5256   else
5257     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5258   MI.eraseFromParent();
5259   return Legalized;
5260 }
5261 
5262 LegalizerHelper::LegalizeResult
5263 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5264                                     LLT NarrowTy) {
5265   // FIXME: Don't know how to handle secondary types yet.
5266   if (TypeIdx != 0)
5267     return UnableToLegalize;
5268 
5269   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5270   SmallVector<uint64_t, 2> Indexes;
5271   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5272   LLT LeftoverTy;
5273   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5274                LeftoverRegs);
5275 
5276   for (Register Reg : LeftoverRegs)
5277     SrcRegs.push_back(Reg);
5278 
5279   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5280   Register OpReg = MI.getOperand(2).getReg();
5281   uint64_t OpStart = MI.getOperand(3).getImm();
5282   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5283   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5284     unsigned DstStart = I * NarrowSize;
5285 
5286     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5287       // The entire subregister is defined by this insert, forward the new
5288       // value.
5289       DstRegs.push_back(OpReg);
5290       continue;
5291     }
5292 
5293     Register SrcReg = SrcRegs[I];
5294     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5295       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5296       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5297       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5298     }
5299 
5300     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5301       // No part of the insert affects this subregister, forward the original.
5302       DstRegs.push_back(SrcReg);
5303       continue;
5304     }
5305 
    // Compute the overlap between this destination piece and the inserted
    // value: ExtractOffset is where the overlap begins within OpReg,
    // InsertOffset is where it begins within the piece, and SegSize is its
    // width in bits.
5308     int64_t ExtractOffset, InsertOffset;
5309     uint64_t SegSize;
5310     if (OpStart < DstStart) {
5311       InsertOffset = 0;
5312       ExtractOffset = DstStart - OpStart;
5313       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5314     } else {
5315       InsertOffset = OpStart - DstStart;
5316       ExtractOffset = 0;
5317       SegSize =
5318         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5319     }
5320 
5321     Register SegReg = OpReg;
5322     if (ExtractOffset != 0 || SegSize != OpSize) {
5323       // A genuine extract is needed.
5324       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5325       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5326     }
5327 
5328     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5329     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5330     DstRegs.push_back(DstReg);
5331   }
5332 
5333   uint64_t WideSize = DstRegs.size() * NarrowSize;
5334   Register DstReg = MI.getOperand(0).getReg();
5335   if (WideSize > RegTy.getSizeInBits()) {
5336     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5337     MIRBuilder.buildMerge(MergeReg, DstRegs);
5338     MIRBuilder.buildTrunc(DstReg, MergeReg);
5339   } else
5340     MIRBuilder.buildMerge(DstReg, DstRegs);
5341 
5342   MI.eraseFromParent();
5343   return Legalized;
5344 }
5345 
5346 LegalizerHelper::LegalizeResult
5347 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5348                                    LLT NarrowTy) {
5349   Register DstReg = MI.getOperand(0).getReg();
5350   LLT DstTy = MRI.getType(DstReg);
5351 
5352   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5353 
5354   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5355   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5356   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5357   LLT LeftoverTy;
5358   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5359                     Src0Regs, Src0LeftoverRegs))
5360     return UnableToLegalize;
5361 
5362   LLT Unused;
5363   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5364                     Src1Regs, Src1LeftoverRegs))
5365     llvm_unreachable("inconsistent extractParts result");
5366 
5367   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5368     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5369                                         {Src0Regs[I], Src1Regs[I]});
5370     DstRegs.push_back(Inst.getReg(0));
5371   }
5372 
5373   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5374     auto Inst = MIRBuilder.buildInstr(
5375       MI.getOpcode(),
5376       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5377     DstLeftoverRegs.push_back(Inst.getReg(0));
5378   }
5379 
5380   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5381               LeftoverTy, DstLeftoverRegs);
5382 
5383   MI.eraseFromParent();
5384   return Legalized;
5385 }
5386 
5387 LegalizerHelper::LegalizeResult
5388 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5389                                  LLT NarrowTy) {
5390   if (TypeIdx != 0)
5391     return UnableToLegalize;
5392 
5393   Register DstReg = MI.getOperand(0).getReg();
5394   Register SrcReg = MI.getOperand(1).getReg();
5395 
5396   LLT DstTy = MRI.getType(DstReg);
5397   if (DstTy.isVector())
5398     return UnableToLegalize;
5399 
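  // Roughly: split the source into GCD-sized pieces, pad up to a common
  // multiple of NarrowTy using the extension opcode's fill (zeros for G_ZEXT,
  // a sign splat for G_SEXT, undef for G_ANYEXT), and re-merge into DstReg.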
5400   SmallVector<Register, 8> Parts;
5401   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy =
      buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5403   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5404 
5405   MI.eraseFromParent();
5406   return Legalized;
5407 }
5408 
5409 LegalizerHelper::LegalizeResult
5410 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5411                                     LLT NarrowTy) {
5412   if (TypeIdx != 0)
5413     return UnableToLegalize;
5414 
5415   Register CondReg = MI.getOperand(1).getReg();
5416   LLT CondTy = MRI.getType(CondReg);
5417   if (CondTy.isVector()) // TODO: Handle vselect
5418     return UnableToLegalize;
5419 
5420   Register DstReg = MI.getOperand(0).getReg();
5421   LLT DstTy = MRI.getType(DstReg);
5422 
5423   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5424   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5425   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5426   LLT LeftoverTy;
5427   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5428                     Src1Regs, Src1LeftoverRegs))
5429     return UnableToLegalize;
5430 
5431   LLT Unused;
5432   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5433                     Src2Regs, Src2LeftoverRegs))
5434     llvm_unreachable("inconsistent extractParts result");
5435 
5436   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5437     auto Select = MIRBuilder.buildSelect(NarrowTy,
5438                                          CondReg, Src1Regs[I], Src2Regs[I]);
5439     DstRegs.push_back(Select.getReg(0));
5440   }
5441 
5442   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5443     auto Select = MIRBuilder.buildSelect(
5444       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5445     DstLeftoverRegs.push_back(Select.getReg(0));
5446   }
5447 
5448   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5449               LeftoverTy, DstLeftoverRegs);
5450 
5451   MI.eraseFromParent();
5452   return Legalized;
5453 }
5454 
5455 LegalizerHelper::LegalizeResult
5456 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5457                                   LLT NarrowTy) {
5458   if (TypeIdx != 1)
5459     return UnableToLegalize;
5460 
5461   Register DstReg = MI.getOperand(0).getReg();
5462   Register SrcReg = MI.getOperand(1).getReg();
5463   LLT DstTy = MRI.getType(DstReg);
5464   LLT SrcTy = MRI.getType(SrcReg);
5465   unsigned NarrowSize = NarrowTy.getSizeInBits();
5466 
5467   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5468     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5469 
5470     MachineIRBuilder &B = MIRBuilder;
5471     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5472     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5473     auto C_0 = B.buildConstant(NarrowTy, 0);
5474     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5475                                 UnmergeSrc.getReg(1), C_0);
5476     auto LoCTLZ = IsUndef ?
5477       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5478       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5479     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5480     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5481     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5482     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5483 
5484     MI.eraseFromParent();
5485     return Legalized;
5486   }
5487 
5488   return UnableToLegalize;
5489 }
5490 
5491 LegalizerHelper::LegalizeResult
5492 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5493                                   LLT NarrowTy) {
5494   if (TypeIdx != 1)
5495     return UnableToLegalize;
5496 
5497   Register DstReg = MI.getOperand(0).getReg();
5498   Register SrcReg = MI.getOperand(1).getReg();
5499   LLT DstTy = MRI.getType(DstReg);
5500   LLT SrcTy = MRI.getType(SrcReg);
5501   unsigned NarrowSize = NarrowTy.getSizeInBits();
5502 
5503   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5504     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5505 
5506     MachineIRBuilder &B = MIRBuilder;
5507     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5508     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5509     auto C_0 = B.buildConstant(NarrowTy, 0);
5510     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5511                                 UnmergeSrc.getReg(0), C_0);
5512     auto HiCTTZ = IsUndef ?
5513       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5514       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5515     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5516     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5517     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5518     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5519 
5520     MI.eraseFromParent();
5521     return Legalized;
5522   }
5523 
5524   return UnableToLegalize;
5525 }
5526 
5527 LegalizerHelper::LegalizeResult
5528 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5529                                    LLT NarrowTy) {
5530   if (TypeIdx != 1)
5531     return UnableToLegalize;
5532 
5533   Register DstReg = MI.getOperand(0).getReg();
5534   LLT DstTy = MRI.getType(DstReg);
5535   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5536   unsigned NarrowSize = NarrowTy.getSizeInBits();
5537 
5538   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
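    // ctpop(Hi:Lo) -> ctpop(Hi) + ctpop(Lo)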
5539     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5540 
5541     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5542     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5543     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5544 
5545     MI.eraseFromParent();
5546     return Legalized;
5547   }
5548 
5549   return UnableToLegalize;
5550 }
5551 
5552 LegalizerHelper::LegalizeResult
5553 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5554   unsigned Opc = MI.getOpcode();
5555   const auto &TII = MIRBuilder.getTII();
5556   auto isSupported = [this](const LegalityQuery &Q) {
5557     auto QAction = LI.getAction(Q).Action;
5558     return QAction == Legal || QAction == Libcall || QAction == Custom;
5559   };
5560   switch (Opc) {
5561   default:
5562     return UnableToLegalize;
5563   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5564     // This trivially expands to CTLZ.
5565     Observer.changingInstr(MI);
5566     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5567     Observer.changedInstr(MI);
5568     return Legalized;
5569   }
5570   case TargetOpcode::G_CTLZ: {
5571     Register DstReg = MI.getOperand(0).getReg();
5572     Register SrcReg = MI.getOperand(1).getReg();
5573     LLT DstTy = MRI.getType(DstReg);
5574     LLT SrcTy = MRI.getType(SrcReg);
5575     unsigned Len = SrcTy.getSizeInBits();
5576 
5577     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5578       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5579       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5580       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5581       auto ICmp = MIRBuilder.buildICmp(
5582           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5583       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5584       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5585       MI.eraseFromParent();
5586       return Legalized;
5587     }
5588     // for now, we do this:
5589     // NewLen = NextPowerOf2(Len);
5590     // x = x | (x >> 1);
5591     // x = x | (x >> 2);
5592     // ...
5593     // x = x | (x >> 16);
5594     // x = x | (x >> 32); // for 64-bit input
5595     // up to a shift of NewLen/2
5596     // return Len - popcount(x);
5597     //
5598     // Ref: "Hacker's Delight" by Henry Warren
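    // E.g. for the s8 input 0b00010000: the ORs smear it to 0b00011111,
    // popcount gives 5, and ctlz = 8 - 5 = 3.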
5599     Register Op = SrcReg;
5600     unsigned NewLen = PowerOf2Ceil(Len);
5601     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5602       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5603       auto MIBOp = MIRBuilder.buildOr(
5604           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5605       Op = MIBOp.getReg(0);
5606     }
5607     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5608     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5609                         MIBPop);
5610     MI.eraseFromParent();
5611     return Legalized;
5612   }
5613   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5614     // This trivially expands to CTTZ.
5615     Observer.changingInstr(MI);
5616     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5617     Observer.changedInstr(MI);
5618     return Legalized;
5619   }
5620   case TargetOpcode::G_CTTZ: {
5621     Register DstReg = MI.getOperand(0).getReg();
5622     Register SrcReg = MI.getOperand(1).getReg();
5623     LLT DstTy = MRI.getType(DstReg);
5624     LLT SrcTy = MRI.getType(SrcReg);
5625 
5626     unsigned Len = SrcTy.getSizeInBits();
5627     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5628       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5629       // zero.
5630       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5631       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5632       auto ICmp = MIRBuilder.buildICmp(
5633           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5634       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5635       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5636       MI.eraseFromParent();
5637       return Legalized;
5638     }
5639     // for now, we use: { return popcount(~x & (x - 1)); }
5640     // unless the target has ctlz but not ctpop, in which case we use:
5641     // { return Len - ctlz(~x & (x - 1)); }
5642     // Ref: "Hacker's Delight" by Henry Warren
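    // E.g. for x = 0b01101000: ~x & (x - 1) = 0b10010111 & 0b01100111 =
    // 0b00000111, and popcount(0b00000111) = 3 = cttz(x).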
5643     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5644     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5645     auto MIBTmp = MIRBuilder.buildAnd(
5646         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5647     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5648         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5649       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5650       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5651                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5652       MI.eraseFromParent();
5653       return Legalized;
5654     }
5655     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5656     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5657     return Legalized;
5658   }
5659   case TargetOpcode::G_CTPOP: {
5660     Register SrcReg = MI.getOperand(1).getReg();
5661     LLT Ty = MRI.getType(SrcReg);
5662     unsigned Size = Ty.getSizeInBits();
5663     MachineIRBuilder &B = MIRBuilder;
5664 
5665     // Count set bits in blocks of 2 bits. The default approach would be
5666     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5667     // We use the following formula instead:
5668     // B2Count = val - { (val >> 1) & 0x55555555 }
5669     // since it gives the same result in blocks of 2 with one instruction less.
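    // E.g. for the 8-bit value 0b11010110 this step yields 0b10010101, i.e.
    // the per-2-bit counts {2, 1, 1, 1}.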
5670     auto C_1 = B.buildConstant(Ty, 1);
5671     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5672     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5673     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5674     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5675     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5676 
5677     // To get the count in blocks of 4, add values from adjacent blocks of 2.
5678     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5679     auto C_2 = B.buildConstant(Ty, 2);
5680     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5681     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5682     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5683     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5684     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5685     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5686 
5687     // For the count in blocks of 8 bits we don't have to mask the high 4 bits
5688     // before the addition since each count sits in the range {0,...,8}, which
5689     // fits in 4 bits. After the addition the high 4 bits still hold the count
5690     // of the high 4-bit block; clear them to get the 8-bit result.
5691     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5692     auto C_4 = B.buildConstant(Ty, 4);
5693     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5694     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5695     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5696     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5697     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5698 
5699     assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
5700     // 8 bits can hold the CTPOP result of a 128-bit (or smaller) integer. A mul
5701     // with this mask sums all of the 8-bit B8Count blocks into the 8 MSBs of ResTmp.
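    // E.g. with 32-bit blocks and B8Count = 0x01020304, multiplying by
    // 0x01010101 leaves 0x01 + 0x02 + 0x03 + 0x04 = 0x0a in the top byte.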
5702     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5703     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5704 
5705     // Shift count result from 8 high bits to low bits.
5706     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5707     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5708 
5709     MI.eraseFromParent();
5710     return Legalized;
5711   }
5712   }
5713 }
5714 
5715 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5716 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5717                                         Register Reg, unsigned BW) {
5718   return matchUnaryPredicate(
5719       MRI, Reg,
5720       [=](const Constant *C) {
5721         // Null constant here means an undef.
5722         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5723         return !CI || CI->getValue().urem(BW) != 0;
5724       },
5725       /*AllowUndefs*/ true);
5726 }
5727 
5728 LegalizerHelper::LegalizeResult
5729 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5730   Register Dst = MI.getOperand(0).getReg();
5731   Register X = MI.getOperand(1).getReg();
5732   Register Y = MI.getOperand(2).getReg();
5733   Register Z = MI.getOperand(3).getReg();
5734   LLT Ty = MRI.getType(Dst);
5735   LLT ShTy = MRI.getType(Z);
5736 
5737   unsigned BW = Ty.getScalarSizeInBits();
5738 
5739   if (!isPowerOf2_32(BW))
5740     return UnableToLegalize;
5741 
5742   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5743   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5744 
5745   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5746     // fshl X, Y, Z -> fshr X, Y, -Z
5747     // fshr X, Y, Z -> fshl X, Y, -Z
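    // E.g. with BW = 8: fshl X, Y, 3 == fshr X, Y, 5, and 5 == -3 (mod 8).
    // This relies on Z % BW != 0, which the guard above just established.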
5748     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5749     Z = MIRBuilder.buildSub(ShTy, Zero, Z).getReg(0);
5750   } else {
5751     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5752     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
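    // A plain negation is wrong when Z % BW == 0 (fshl by 0 returns X, but
    // fshr by 0 returns Y), so pre-shift by one and shift the rest of the way
    // by ~Z == BW - 1 - Z (mod BW), which is valid for every Z.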
5753     auto One = MIRBuilder.buildConstant(ShTy, 1);
5754     if (IsFSHL) {
5755       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5756       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5757     } else {
5758       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5759       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5760     }
5761 
5762     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5763   }
5764 
5765   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5766   MI.eraseFromParent();
5767   return Legalized;
5768 }
5769 
5770 LegalizerHelper::LegalizeResult
5771 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5772   Register Dst = MI.getOperand(0).getReg();
5773   Register X = MI.getOperand(1).getReg();
5774   Register Y = MI.getOperand(2).getReg();
5775   Register Z = MI.getOperand(3).getReg();
5776   LLT Ty = MRI.getType(Dst);
5777   LLT ShTy = MRI.getType(Z);
5778 
5779   const unsigned BW = Ty.getScalarSizeInBits();
5780   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5781 
5782   Register ShX, ShY;
5783   Register ShAmt, InvShAmt;
5784 
5785   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5786   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5787     // fshl: X << C | Y >> (BW - C)
5788     // fshr: X << (BW - C) | Y >> C
5789     // where C = Z % BW is not zero
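    // E.g. BW = 32, Z = 40: C = 8, so fshl becomes (X << 8) | (Y >> 24).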
5790     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5791     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5792     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5793     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5794     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5795   } else {
5796     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5797     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
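    // The extra constant shift by 1 keeps both variable shift amounts
    // strictly below BW even when Z % BW == 0, avoiding out-of-range shifts.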
5798     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5799     if (isPowerOf2_32(BW)) {
5800       // Z % BW -> Z & (BW - 1)
5801       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5802       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5803       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5804       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5805     } else {
5806       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5807       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5808       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5809     }
5810 
5811     auto One = MIRBuilder.buildConstant(ShTy, 1);
5812     if (IsFSHL) {
5813       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5814       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5815       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5816     } else {
5817       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5818       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5819       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5820     }
5821   }
5822 
5823   MIRBuilder.buildOr(Dst, ShX, ShY);
5824   MI.eraseFromParent();
5825   return Legalized;
5826 }
5827 
5828 LegalizerHelper::LegalizeResult
5829 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5830   // These operations approximately do the following (while avoiding undefined
5831   // shifts by BW):
5832   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5833   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5834   Register Dst = MI.getOperand(0).getReg();
5835   LLT Ty = MRI.getType(Dst);
5836   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5837 
5838   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5839   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5840 
5841   // TODO: Use smarter heuristic that accounts for vector legalization.
5842   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5843     return lowerFunnelShiftAsShifts(MI);
5844 
5845   // This only works for powers of 2; fall back to shifts if it fails.
5846   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5847   if (Result == UnableToLegalize)
5848     return lowerFunnelShiftAsShifts(MI);
5849   return Result;
5850 }
5851 
5852 LegalizerHelper::LegalizeResult
5853 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5854   Register Dst = MI.getOperand(0).getReg();
5855   Register Src = MI.getOperand(1).getReg();
5856   Register Amt = MI.getOperand(2).getReg();
5857   LLT AmtTy = MRI.getType(Amt);
5858   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5859   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5860   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5861   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5862   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5863   MI.eraseFromParent();
5864   return Legalized;
5865 }
5866 
5867 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5868   Register Dst = MI.getOperand(0).getReg();
5869   Register Src = MI.getOperand(1).getReg();
5870   Register Amt = MI.getOperand(2).getReg();
5871   LLT DstTy = MRI.getType(Dst);
5872   LLT SrcTy = MRI.getType(Src);
5873   LLT AmtTy = MRI.getType(Amt);
5874 
5875   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5876   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5877 
5878   MIRBuilder.setInstrAndDebugLoc(MI);
5879 
5880   // If a rotate in the other direction is supported, use it.
5881   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5882   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5883       isPowerOf2_32(EltSizeInBits))
5884     return lowerRotateWithReverseRotate(MI);
5885 
5886   // If a funnel shift is supported, use it.
5887   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5888   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5889   bool IsFShLegal = false;
5890   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
5891       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
5892     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
5893                                 Register R3) {
5894       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
5895       MI.eraseFromParent();
5896       return Legalized;
5897     };
5898     // Prefer the same-direction funnel shift; otherwise negate the amount.
5899     if (IsFShLegal) {
5900       return buildFunnelShift(FShOpc, Dst, Src, Amt);
5901     } else if (isPowerOf2_32(EltSizeInBits)) {
5902       Amt = MIRBuilder.buildNeg(AmtTy, Amt).getReg(0);
5903       return buildFunnelShift(RevFsh, Dst, Src, Amt);
5904     }
5905   }
5906 
5907   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5908   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
5909   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
5910   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
5911   Register ShVal;
5912   Register RevShiftVal;
5913   if (isPowerOf2_32(EltSizeInBits)) {
5914     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
5915     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
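    // E.g. rotl x, 37 on s32: 37 & 31 == 5 and -37 & 31 == 27, giving
    // x << 5 | x >> 27.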
5916     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5917     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
5918     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5919     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
5920     RevShiftVal =
5921         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
5922   } else {
5923     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
5924     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
5925     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
5926     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
5927     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5928     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
5929     auto One = MIRBuilder.buildConstant(AmtTy, 1);
5930     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
5931     RevShiftVal =
5932         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
5933   }
5934   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
5935   MI.eraseFromParent();
5936   return Legalized;
5937 }
5938 
5939 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
5940 // representation.
5941 LegalizerHelper::LegalizeResult
5942 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
5943   Register Dst = MI.getOperand(0).getReg();
5944   Register Src = MI.getOperand(1).getReg();
5945   const LLT S64 = LLT::scalar(64);
5946   const LLT S32 = LLT::scalar(32);
5947   const LLT S1 = LLT::scalar(1);
5948 
5949   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
5950 
5951   // unsigned cul2f(ulong u) {
5952   //   uint lz = clz(u);
5953   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
5954   //   u = (u << lz) & 0x7fffffffffffffffUL;
5955   //   ulong t = u & 0xffffffffffUL;
5956   //   uint v = (e << 23) | (uint)(u >> 40);
5957   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
5958   //   return as_float(v + r);
5959   // }
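  // E.g. u = 1: lz = 63, e = 127 + 63 - 63 = 127, the shifted-out mantissa
  // bits are all zero, and v = 127 << 23, the bit pattern of 1.0f.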
5960 
5961   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
5962   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
5963 
5964   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
5965 
5966   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
5967   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
5968 
5969   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
5970   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
5971 
5972   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
5973   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
5974 
5975   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
5976 
5977   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
5978   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
5979 
5980   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
5981   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
5982   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
5983 
5984   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
5985   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
5986   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
5987   auto One = MIRBuilder.buildConstant(S32, 1);
5988 
5989   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
5990   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
5991   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
5992   MIRBuilder.buildAdd(Dst, V, R);
5993 
5994   MI.eraseFromParent();
5995   return Legalized;
5996 }
5997 
5998 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5999   Register Dst = MI.getOperand(0).getReg();
6000   Register Src = MI.getOperand(1).getReg();
6001   LLT DstTy = MRI.getType(Dst);
6002   LLT SrcTy = MRI.getType(Src);
6003 
6004   if (SrcTy == LLT::scalar(1)) {
6005     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6006     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6007     MIRBuilder.buildSelect(Dst, Src, True, False);
6008     MI.eraseFromParent();
6009     return Legalized;
6010   }
6011 
6012   if (SrcTy != LLT::scalar(64))
6013     return UnableToLegalize;
6014 
6015   if (DstTy == LLT::scalar(32)) {
6016     // TODO: SelectionDAG has several alternative expansions to port which may
6017     // be more reasonable depending on the available instructions. If a target
6018     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6019     // intermediate type, this is probably worse.
6020     return lowerU64ToF32BitOps(MI);
6021   }
6022 
6023   return UnableToLegalize;
6024 }
6025 
6026 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6027   Register Dst = MI.getOperand(0).getReg();
6028   Register Src = MI.getOperand(1).getReg();
6029   LLT DstTy = MRI.getType(Dst);
6030   LLT SrcTy = MRI.getType(Src);
6031 
6032   const LLT S64 = LLT::scalar(64);
6033   const LLT S32 = LLT::scalar(32);
6034   const LLT S1 = LLT::scalar(1);
6035 
6036   if (SrcTy == S1) {
6037     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6038     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6039     MIRBuilder.buildSelect(Dst, Src, True, False);
6040     MI.eraseFromParent();
6041     return Legalized;
6042   }
6043 
6044   if (SrcTy != S64)
6045     return UnableToLegalize;
6046 
6047   if (DstTy == S32) {
6048     // signed cl2f(long l) {
6049     //   long s = l >> 63;
6050     //   float r = cul2f((l + s) ^ s);
6051     //   return s ? -r : r;
6052     // }
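    // E.g. l = -2: s = -1, (l + s) ^ s = (-3) ^ (-1) = 2, r = cul2f(2) =
    // 2.0f, and since s != 0 the result is -2.0f.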
6053     Register L = Src;
6054     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6055     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6056 
6057     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6058     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6059     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6060 
6061     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6062     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6063                                             MIRBuilder.buildConstant(S64, 0));
6064     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6065     MI.eraseFromParent();
6066     return Legalized;
6067   }
6068 
6069   return UnableToLegalize;
6070 }
6071 
6072 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6073   Register Dst = MI.getOperand(0).getReg();
6074   Register Src = MI.getOperand(1).getReg();
6075   LLT DstTy = MRI.getType(Dst);
6076   LLT SrcTy = MRI.getType(Src);
6077   const LLT S64 = LLT::scalar(64);
6078   const LLT S32 = LLT::scalar(32);
6079 
6080   if (SrcTy != S64 && SrcTy != S32)
6081     return UnableToLegalize;
6082   if (DstTy != S32 && DstTy != S64)
6083     return UnableToLegalize;
6084 
6085   // FPTOSI gives the same result as FPTOUI for positive signed integers.
6086   // FPTOUI needs to deal with fp values that convert to unsigned integers that
6087   // are >= 2^(DstBits - 1): 2^31 for an s32 result, 2^63 for s64. Call it 2^Exp.
6088 
6089   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6090   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6091                                                 : APFloat::IEEEdouble(),
6092                     APInt::getZero(SrcTy.getSizeInBits()));
6093   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6094 
6095   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6096 
6097   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6098   // For fp values greater than or equal to Threshold (2^Exp), we use FPTOSI on
6099   // (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the result.
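  // E.g. for the f32 value 2^31 + 256 converted to s32: FPTOSI(Src - 2^31)
  // gives 256, and xoring in the sign-mask bit reconstructs 0x80000100.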
6100   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6101   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6102   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6103   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6104 
6105   const LLT S1 = LLT::scalar(1);
6106 
6107   MachineInstrBuilder FCMP =
6108       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6109   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6110 
6111   MI.eraseFromParent();
6112   return Legalized;
6113 }
6114 
6115 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6116   Register Dst = MI.getOperand(0).getReg();
6117   Register Src = MI.getOperand(1).getReg();
6118   LLT DstTy = MRI.getType(Dst);
6119   LLT SrcTy = MRI.getType(Src);
6120   const LLT S64 = LLT::scalar(64);
6121   const LLT S32 = LLT::scalar(32);
6122 
6123   // FIXME: Only f32 to i64 conversions are supported.
6124   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6125     return UnableToLegalize;
6126 
6127   // Expand f32 -> i64 conversion
6128   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6129   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
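  // E.g. 1.5f (0x3FC00000): the biased exponent is 127, so Exponent == 0 and
  // (mantissa | implicit bit) >> (23 - 0) == 0x00C00000 >> 23 == 1.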
6130 
6131   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6132 
6133   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6134   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6135 
6136   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6137   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6138 
6139   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6140                                            APInt::getSignMask(SrcEltBits));
6141   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6142   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6143   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6144   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6145 
6146   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6147   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6148   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6149 
6150   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6151   R = MIRBuilder.buildZExt(DstTy, R);
6152 
6153   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6154   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6155   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6156   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6157 
6158   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6159   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6160 
6161   const LLT S1 = LLT::scalar(1);
6162   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6163                                     S1, Exponent, ExponentLoBit);
6164 
6165   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6166 
6167   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6168   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6169 
6170   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6171 
6172   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6173                                           S1, Exponent, ZeroSrcTy);
6174 
6175   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6176   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6177 
6178   MI.eraseFromParent();
6179   return Legalized;
6180 }
6181 
6182 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6183 LegalizerHelper::LegalizeResult
6184 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6185   Register Dst = MI.getOperand(0).getReg();
6186   Register Src = MI.getOperand(1).getReg();
6187 
6188   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6189     return UnableToLegalize;
6190 
6191   const unsigned ExpMask = 0x7ff;
6192   const unsigned ExpBiasf64 = 1023;
6193   const unsigned ExpBiasf16 = 15;
6194   const LLT S32 = LLT::scalar(32);
6195   const LLT S1 = LLT::scalar(1);
6196 
6197   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6198   Register U = Unmerge.getReg(0);
6199   Register UH = Unmerge.getReg(1);
6200 
6201   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6202   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6203 
6204   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6205   // add the f16 bias (15) to get the biased exponent for the f16 format.
6206   E = MIRBuilder.buildAdd(
6207     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6208 
6209   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6210   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6211 
6212   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6213                                        MIRBuilder.buildConstant(S32, 0x1ff));
6214   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6215 
6216   auto Zero = MIRBuilder.buildConstant(S32, 0);
6217   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6218   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6219   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6220 
6221   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6222   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6223   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6224   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6225 
6226   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6227   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6228 
6229   // N = M | (E << 12);
6230   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6231   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6232 
6233   // B = clamp(1-E, 0, 13);
6234   auto One = MIRBuilder.buildConstant(S32, 1);
6235   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6236   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6237   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6238 
6239   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6240                                        MIRBuilder.buildConstant(S32, 0x1000));
6241 
6242   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6243   auto D0 = MIRBuilder.buildShl(S32, D, B);
6244 
6245   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6246                                              D0, SigSetHigh);
6247   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6248   D = MIRBuilder.buildOr(S32, D, D1);
6249 
6250   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6251   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6252 
6253   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6254   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6255 
6256   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6257                                        MIRBuilder.buildConstant(S32, 3));
6258   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6259 
6260   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6261                                        MIRBuilder.buildConstant(S32, 5));
6262   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6263 
6264   V1 = MIRBuilder.buildOr(S32, V0, V1);
6265   V = MIRBuilder.buildAdd(S32, V, V1);
6266 
6267   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
6268                                        E, MIRBuilder.buildConstant(S32, 30));
6269   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6270                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6271 
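  // E == 1039 (0x7ff - 1023 + 15) means the source exponent was all ones,
  // i.e. the input was an Inf or NaN, so use the Inf/NaN pattern I instead.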
6272   auto CmpEEq1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6273                                          E, MIRBuilder.buildConstant(S32, 1039));
6274   V = MIRBuilder.buildSelect(S32, CmpEEq1039, I, V);
6275 
6276   // Extract the sign bit.
6277   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6278   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6279 
6280   // Insert the sign bit.
6281   V = MIRBuilder.buildOr(S32, Sign, V);
6282 
6283   MIRBuilder.buildTrunc(Dst, V);
6284   MI.eraseFromParent();
6285   return Legalized;
6286 }
6287 
6288 LegalizerHelper::LegalizeResult
6289 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6290   Register Dst = MI.getOperand(0).getReg();
6291   Register Src = MI.getOperand(1).getReg();
6292 
6293   LLT DstTy = MRI.getType(Dst);
6294   LLT SrcTy = MRI.getType(Src);
6295   const LLT S64 = LLT::scalar(64);
6296   const LLT S16 = LLT::scalar(16);
6297 
6298   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6299     return lowerFPTRUNC_F64_TO_F16(MI);
6300 
6301   return UnableToLegalize;
6302 }
6303 
6304 // TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
6305 // multiplication tree.
6306 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6307   Register Dst = MI.getOperand(0).getReg();
6308   Register Src0 = MI.getOperand(1).getReg();
6309   Register Src1 = MI.getOperand(2).getReg();
6310   LLT Ty = MRI.getType(Dst);
6311 
6312   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6313   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6314   MI.eraseFromParent();
6315   return Legalized;
6316 }
6317 
6318 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6319   switch (Opc) {
6320   case TargetOpcode::G_SMIN:
6321     return CmpInst::ICMP_SLT;
6322   case TargetOpcode::G_SMAX:
6323     return CmpInst::ICMP_SGT;
6324   case TargetOpcode::G_UMIN:
6325     return CmpInst::ICMP_ULT;
6326   case TargetOpcode::G_UMAX:
6327     return CmpInst::ICMP_UGT;
6328   default:
6329     llvm_unreachable("not in integer min/max");
6330   }
6331 }
6332 
6333 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6334   Register Dst = MI.getOperand(0).getReg();
6335   Register Src0 = MI.getOperand(1).getReg();
6336   Register Src1 = MI.getOperand(2).getReg();
6337 
6338   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6339   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6340 
6341   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6342   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6343 
6344   MI.eraseFromParent();
6345   return Legalized;
6346 }
6347 
6348 LegalizerHelper::LegalizeResult
6349 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6350   Register Dst = MI.getOperand(0).getReg();
6351   Register Src0 = MI.getOperand(1).getReg();
6352   Register Src1 = MI.getOperand(2).getReg();
6353 
6354   const LLT Src0Ty = MRI.getType(Src0);
6355   const LLT Src1Ty = MRI.getType(Src1);
6356 
6357   const int Src0Size = Src0Ty.getScalarSizeInBits();
6358   const int Src1Size = Src1Ty.getScalarSizeInBits();
6359 
6360   auto SignBitMask = MIRBuilder.buildConstant(
6361     Src0Ty, APInt::getSignMask(Src0Size));
6362 
6363   auto NotSignBitMask = MIRBuilder.buildConstant(
6364     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6365 
6366   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6367   Register And1;
6368   if (Src0Ty == Src1Ty) {
6369     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6370   } else if (Src0Size > Src1Size) {
6371     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6372     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6373     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6374     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6375   } else {
6376     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6377     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6378     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6379     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6380   }
6381 
6382   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6383   // mask constants reinterpreted as floats are a NaN and -0.0, but the final
6384   // result should preserve everything.
6385   unsigned Flags = MI.getFlags();
6386   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6387 
6388   MI.eraseFromParent();
6389   return Legalized;
6390 }
6391 
6392 LegalizerHelper::LegalizeResult
6393 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6394   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6395     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6396 
6397   Register Dst = MI.getOperand(0).getReg();
6398   Register Src0 = MI.getOperand(1).getReg();
6399   Register Src1 = MI.getOperand(2).getReg();
6400   LLT Ty = MRI.getType(Dst);
6401 
6402   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6403     // Insert canonicalizes if it's possible that we need to quiet to get
6404     // correct sNaN behavior.
6405 
6406     // Note this must be done here, and not as an optimization combine in the
6407     // absence of a dedicated quiet-snan instruction as we're using an
6408     // omni-purpose G_FCANONICALIZE.
6409     if (!isKnownNeverSNaN(Src0, MRI))
6410       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6411 
6412     if (!isKnownNeverSNaN(Src1, MRI))
6413       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6414   }
6415 
6416   // With any sNaNs quieted above, the IEEE variant has the same behavior, so
6417   // it is safe to simply replace this operation with it.
6418   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6419   MI.eraseFromParent();
6420   return Legalized;
6421 }
6422 
6423 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6424   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6425   Register DstReg = MI.getOperand(0).getReg();
6426   LLT Ty = MRI.getType(DstReg);
6427   unsigned Flags = MI.getFlags();
6428 
6429   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6430                                   Flags);
6431   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6432   MI.eraseFromParent();
6433   return Legalized;
6434 }
6435 
6436 LegalizerHelper::LegalizeResult
6437 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6438   Register DstReg = MI.getOperand(0).getReg();
6439   Register X = MI.getOperand(1).getReg();
6440   const unsigned Flags = MI.getFlags();
6441   const LLT Ty = MRI.getType(DstReg);
6442   const LLT CondTy = Ty.changeElementSize(1);
6443 
6444   // round(x) =>
6445   //  t = trunc(x);
6446   //  d = fabs(x - t);
6447   //  o = copysign(1.0f, x);
6448   //  return t + (d >= 0.5 ? o : 0.0);
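  //  E.g. round(-2.5): t = -2.0, d = 0.5, o = -1.0, so the result is
  //  t + o = -3.0 (halfway cases round away from zero).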
6449 
6450   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6451 
6452   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6453   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6454   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6455   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6456   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6457   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6458 
6459   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6460                                   Flags);
6461   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6462 
6463   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6464 
6465   MI.eraseFromParent();
6466   return Legalized;
6467 }
6468 
6469 LegalizerHelper::LegalizeResult
6470 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6471   Register DstReg = MI.getOperand(0).getReg();
6472   Register SrcReg = MI.getOperand(1).getReg();
6473   unsigned Flags = MI.getFlags();
6474   LLT Ty = MRI.getType(DstReg);
6475   const LLT CondTy = Ty.changeElementSize(1);
6476 
6477   // result = trunc(src);
6478   // if (src < 0.0 && src != result)
6479   //   result += -1.0.
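  // E.g. floor(-1.25): trunc gives -1.0, the source is negative and differs
  // from the truncated value, so -1.0 is added, yielding -2.0.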
6480 
6481   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6482   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6483 
6484   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6485                                   SrcReg, Zero, Flags);
6486   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6487                                       SrcReg, Trunc, Flags);
6488   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6489   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6490 
6491   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6492   MI.eraseFromParent();
6493   return Legalized;
6494 }
6495 
6496 LegalizerHelper::LegalizeResult
6497 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6498   const unsigned NumOps = MI.getNumOperands();
6499   Register DstReg = MI.getOperand(0).getReg();
6500   Register Src0Reg = MI.getOperand(1).getReg();
6501   LLT DstTy = MRI.getType(DstReg);
6502   LLT SrcTy = MRI.getType(Src0Reg);
6503   unsigned PartSize = SrcTy.getSizeInBits();
6504 
6505   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6506   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6507 
6508   for (unsigned I = 2; I != NumOps; ++I) {
6509     const unsigned Offset = (I - 1) * PartSize;
6510 
6511     Register SrcReg = MI.getOperand(I).getReg();
6512     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6513 
6514     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6515       MRI.createGenericVirtualRegister(WideTy);
6516 
6517     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6518     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6519     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6520     ResultReg = NextResult;
6521   }
6522 
6523   if (DstTy.isPointer()) {
6524     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6525           DstTy.getAddressSpace())) {
6526       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6527       return UnableToLegalize;
6528     }
6529 
6530     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6531   }
6532 
6533   MI.eraseFromParent();
6534   return Legalized;
6535 }
6536 
6537 LegalizerHelper::LegalizeResult
6538 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6539   const unsigned NumDst = MI.getNumOperands() - 1;
6540   Register SrcReg = MI.getOperand(NumDst).getReg();
6541   Register Dst0Reg = MI.getOperand(0).getReg();
6542   LLT DstTy = MRI.getType(Dst0Reg);
6543   if (DstTy.isPointer())
6544     return UnableToLegalize; // TODO
6545 
6546   SrcReg = coerceToScalar(SrcReg);
6547   if (!SrcReg)
6548     return UnableToLegalize;
6549 
6550   // Expand scalarizing unmerge as bitcast to integer and shift.
6551   LLT IntTy = MRI.getType(SrcReg);
6552 
6553   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6554 
6555   const unsigned DstSize = DstTy.getSizeInBits();
6556   unsigned Offset = DstSize;
6557   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6558     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6559     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6560     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6561   }
6562 
6563   MI.eraseFromParent();
6564   return Legalized;
6565 }
6566 
6567 /// Lower a vector extract or insert by writing the vector to a stack temporary
6568 /// and reloading the element or vector.
6569 ///
6570 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6571 ///  =>
6572 ///  %stack_temp = G_FRAME_INDEX
6573 ///  G_STORE %vec, %stack_temp
6574 ///  %idx = clamp(%idx, %vec.getNumElements())
6575 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6576 ///  %dst = G_LOAD %element_ptr
6577 LegalizerHelper::LegalizeResult
6578 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6579   Register DstReg = MI.getOperand(0).getReg();
6580   Register SrcVec = MI.getOperand(1).getReg();
6581   Register InsertVal;
6582   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6583     InsertVal = MI.getOperand(2).getReg();
6584 
6585   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6586 
6587   LLT VecTy = MRI.getType(SrcVec);
6588   LLT EltTy = VecTy.getElementType();
6589   unsigned NumElts = VecTy.getNumElements();
6590 
6591   int64_t IdxVal;
6592   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal < NumElts) {
6593     SmallVector<Register, 8> SrcRegs;
6594     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6595 
6596     if (InsertVal) {
6597       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6598       MIRBuilder.buildMerge(DstReg, SrcRegs);
6599     } else {
6600       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6601     }
6602 
6603     MI.eraseFromParent();
6604     return Legalized;
6605   }
6606 
6607   if (!EltTy.isByteSized()) { // Not implemented.
6608     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6609     return UnableToLegalize;
6610   }
6611 
6612   unsigned EltBytes = EltTy.getSizeInBytes();
6613   Align VecAlign = getStackTemporaryAlignment(VecTy);
6614   Align EltAlign;
6615 
6616   MachinePointerInfo PtrInfo;
6617   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6618                                         VecAlign, PtrInfo);
6619   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6620 
6621   // Get the pointer to the element, and be sure not to hit undefined behavior
6622   // if the index is out of bounds.
6623   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6624 
6625   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6626     int64_t Offset = IdxVal * EltBytes;
6627     PtrInfo = PtrInfo.getWithOffset(Offset);
6628     EltAlign = commonAlignment(VecAlign, Offset);
6629   } else {
6630     // We lose information with a variable offset.
6631     EltAlign = getStackTemporaryAlignment(EltTy);
6632     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6633   }
6634 
6635   if (InsertVal) {
6636     // Write the inserted element
6637     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6638 
6639     // Reload the whole vector.
6640     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6641   } else {
6642     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6643   }
6644 
6645   MI.eraseFromParent();
6646   return Legalized;
6647 }
6648 
6649 LegalizerHelper::LegalizeResult
6650 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6651   Register DstReg = MI.getOperand(0).getReg();
6652   Register Src0Reg = MI.getOperand(1).getReg();
6653   Register Src1Reg = MI.getOperand(2).getReg();
6654   LLT Src0Ty = MRI.getType(Src0Reg);
6655   LLT DstTy = MRI.getType(DstReg);
6656   LLT IdxTy = LLT::scalar(32);
6657 
6658   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6659 
6660   if (DstTy.isScalar()) {
6661     if (Src0Ty.isVector())
6662       return UnableToLegalize;
6663 
6664     // This is just a SELECT.
6665     assert(Mask.size() == 1 && "Expected a single mask element");
6666     Register Val;
6667     if (Mask[0] < 0 || Mask[0] > 1)
6668       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6669     else
6670       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6671     MIRBuilder.buildCopy(DstReg, Val);
6672     MI.eraseFromParent();
6673     return Legalized;
6674   }
6675 
6676   Register Undef;
6677   SmallVector<Register, 32> BuildVec;
6678   LLT EltTy = DstTy.getElementType();
6679 
6680   for (int Idx : Mask) {
6681     if (Idx < 0) {
6682       if (!Undef.isValid())
6683         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6684       BuildVec.push_back(Undef);
6685       continue;
6686     }
6687 
6688     if (Src0Ty.isScalar()) {
6689       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6690     } else {
6691       int NumElts = Src0Ty.getNumElements();
6692       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6693       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6694       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6695       auto Extract = MIRBuilder.buildExtractVectorElt(EltTy, SrcVec, IdxK);
6696       BuildVec.push_back(Extract.getReg(0));
6697     }
6698   }
6699 
6700   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6701   MI.eraseFromParent();
6702   return Legalized;
6703 }
6704 
6705 LegalizerHelper::LegalizeResult
6706 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6707   const auto &MF = *MI.getMF();
6708   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6709   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6710     return UnableToLegalize;
6711 
6712   Register Dst = MI.getOperand(0).getReg();
6713   Register AllocSize = MI.getOperand(1).getReg();
6714   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6715 
6716   LLT PtrTy = MRI.getType(Dst);
6717   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6718 
6719   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6720   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6721   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6722 
6723   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6724   // have to generate an extra instruction to negate the alloc and then use
6725   // G_PTR_ADD to add the negative offset.
6726   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6727   if (Alignment > Align(1)) {
6728     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6729     AlignMask.negate();
6730     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6731     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6732   }
6733 
6734   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6735   MIRBuilder.buildCopy(SPReg, SPTmp);
6736   MIRBuilder.buildCopy(Dst, SPTmp);
6737 
6738   MI.eraseFromParent();
6739   return Legalized;
6740 }
6741 
6742 LegalizerHelper::LegalizeResult
6743 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6744   Register Dst = MI.getOperand(0).getReg();
6745   Register Src = MI.getOperand(1).getReg();
6746   unsigned Offset = MI.getOperand(2).getImm();
6747 
6748   LLT DstTy = MRI.getType(Dst);
6749   LLT SrcTy = MRI.getType(Src);
6750 
6751   // Extract sub-vector or one element
6752   if (SrcTy.isVector()) {
6753     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6754     unsigned DstSize = DstTy.getSizeInBits();
6755 
6756     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6757         (Offset + DstSize <= SrcTy.getSizeInBits())) {
6758       // Unmerge and allow access to each Src element for the artifact combiner.
6759       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);
6760 
6761       // Copy out the element(s) we need to extract (merging if more than one).
6762       SmallVector<Register, 8> SubVectorElts;
6763       for (unsigned Idx = Offset / SrcEltSize;
6764            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6765         SubVectorElts.push_back(Unmerge.getReg(Idx));
6766       }
6767       if (SubVectorElts.size() == 1)
6768         MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
6769       else
6770         MIRBuilder.buildMerge(Dst, SubVectorElts);
6771 
6772       MI.eraseFromParent();
6773       return Legalized;
6774     }
6775   }
6776 
6777   if (DstTy.isScalar() &&
6778       (SrcTy.isScalar() ||
6779        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6780     LLT SrcIntTy = SrcTy;
6781     if (!SrcTy.isScalar()) {
6782       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6783       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6784     }
6785 
6786     if (Offset == 0)
6787       MIRBuilder.buildTrunc(Dst, Src);
6788     else {
6789       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6790       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6791       MIRBuilder.buildTrunc(Dst, Shr);
6792     }
6793 
6794     MI.eraseFromParent();
6795     return Legalized;
6796   }
6797 
6798   return UnableToLegalize;
6799 }
6800 
6801 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6802   Register Dst = MI.getOperand(0).getReg();
6803   Register Src = MI.getOperand(1).getReg();
6804   Register InsertSrc = MI.getOperand(2).getReg();
6805   uint64_t Offset = MI.getOperand(3).getImm();
6806 
6807   LLT DstTy = MRI.getType(Src);
6808   LLT InsertTy = MRI.getType(InsertSrc);
6809 
6810   // Insert sub-vector or one element
6811   if (DstTy.isVector() && !InsertTy.isPointer()) {
6812     LLT EltTy = DstTy.getElementType();
6813     unsigned EltSize = EltTy.getSizeInBits();
6814     unsigned InsertSize = InsertTy.getSizeInBits();
6815 
6816     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6817         (Offset + InsertSize <= DstTy.getSizeInBits())) {
6818       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6819       SmallVector<Register, 8> DstElts;
6820       unsigned Idx = 0;
6821       // Elements from Src before the insertion offset.
6822       for (; Idx < Offset / EltSize; ++Idx) {
6823         DstElts.push_back(UnmergeSrc.getReg(Idx));
6824       }
6825 
6826       // Replace elements in Src with elements from InsertSrc
6827       if (InsertTy.getSizeInBits() > EltSize) {
6828         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6829         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6830              ++Idx, ++i) {
6831           DstElts.push_back(UnmergeInsertSrc.getReg(i));
6832         }
6833       } else {
6834         DstElts.push_back(InsertSrc);
6835         ++Idx;
6836       }
6837 
6838       // Remaining elements from Src after insert
6839       for (; Idx < DstTy.getNumElements(); ++Idx) {
6840         DstElts.push_back(UnmergeSrc.getReg(Idx));
6841       }
6842 
6843       MIRBuilder.buildMerge(Dst, DstElts);
6844       MI.eraseFromParent();
6845       return Legalized;
6846     }
6847   }
6848 
6849   if (InsertTy.isVector() ||
6850       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6851     return UnableToLegalize;
6852 
6853   const DataLayout &DL = MIRBuilder.getDataLayout();
6854   if ((DstTy.isPointer() &&
6855        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6856       (InsertTy.isPointer() &&
6857        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6858     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6859     return UnableToLegalize;
6860   }
6861 
6862   LLT IntDstTy = DstTy;
6863 
6864   if (!DstTy.isScalar()) {
6865     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6866     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6867   }
6868 
6869   if (!InsertTy.isScalar()) {
6870     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6871     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6872   }
6873 
6874   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6875   if (Offset != 0) {
6876     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6877     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6878   }
6879 
6880   APInt MaskVal = APInt::getBitsSetWithWrap(
6881       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6882 
6883   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6884   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6885   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6886 
6887   MIRBuilder.buildCast(Dst, Or);
6888   MI.eraseFromParent();
6889   return Legalized;
6890 }
6891 
6892 LegalizerHelper::LegalizeResult
6893 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6894   Register Dst0 = MI.getOperand(0).getReg();
6895   Register Dst1 = MI.getOperand(1).getReg();
6896   Register LHS = MI.getOperand(2).getReg();
6897   Register RHS = MI.getOperand(3).getReg();
6898   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6899 
6900   LLT Ty = MRI.getType(Dst0);
6901   LLT BoolTy = MRI.getType(Dst1);
6902 
6903   if (IsAdd)
6904     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6905   else
6906     MIRBuilder.buildSub(Dst0, LHS, RHS);
6907 
6908   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6909 
6910   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6911 
6912   // For an addition, the result should be less than one of the operands (LHS)
6913   // if and only if the other operand (RHS) is negative, otherwise there will
6914   // be overflow.
6915   // For a subtraction, the result should be less than one of the operands
6916   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6917   // otherwise there will be overflow.
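  // E.g. s8 saddo 100 + 100 wraps to -56: the result is less than LHS even
  // though RHS is positive, so the xor below reports overflow.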
6918   auto ResultLowerThanLHS =
6919       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6920   auto ConditionRHS = MIRBuilder.buildICmp(
6921       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6922 
6923   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6924   MI.eraseFromParent();
6925   return Legalized;
6926 }
6927 
6928 LegalizerHelper::LegalizeResult
6929 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6930   Register Res = MI.getOperand(0).getReg();
6931   Register LHS = MI.getOperand(1).getReg();
6932   Register RHS = MI.getOperand(2).getReg();
6933   LLT Ty = MRI.getType(Res);
6934   bool IsSigned;
6935   bool IsAdd;
6936   unsigned BaseOp;
6937   switch (MI.getOpcode()) {
6938   default:
6939     llvm_unreachable("unexpected addsat/subsat opcode");
6940   case TargetOpcode::G_UADDSAT:
6941     IsSigned = false;
6942     IsAdd = true;
6943     BaseOp = TargetOpcode::G_ADD;
6944     break;
6945   case TargetOpcode::G_SADDSAT:
6946     IsSigned = true;
6947     IsAdd = true;
6948     BaseOp = TargetOpcode::G_ADD;
6949     break;
6950   case TargetOpcode::G_USUBSAT:
6951     IsSigned = false;
6952     IsAdd = false;
6953     BaseOp = TargetOpcode::G_SUB;
6954     break;
6955   case TargetOpcode::G_SSUBSAT:
6956     IsSigned = true;
6957     IsAdd = false;
6958     BaseOp = TargetOpcode::G_SUB;
6959     break;
6960   }
6961 
6962   if (IsSigned) {
6963     // sadd.sat(a, b) ->
6964     //   hi = 0x7fffffff - smax(a, 0)
6965     //   lo = 0x80000000 - smin(a, 0)
6966     //   a + smin(smax(lo, b), hi)
6967     // ssub.sat(a, b) ->
6968     //   lo = smax(a, -1) - 0x7fffffff
6969     //   hi = smin(a, -1) - 0x80000000
6970     //   a - smin(smax(lo, b), hi)
6971     // TODO: AMDGPU can use a "median of 3" instruction here:
6972     //   a +/- med3(lo, b, hi)
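    // For example, sadd.sat on s32 with a = 0x7fffff00 gives
    // hi = 0x7fffffff - smax(a, 0) = 0xff and
    // lo = 0x80000000 - smin(a, 0) = 0x80000000 (INT32_MIN), so b is clamped
    // to [INT32_MIN, 255] and a + clamp(b) cannot wrap in either direction.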
6973     uint64_t NumBits = Ty.getScalarSizeInBits();
6974     auto MaxVal =
6975         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
6976     auto MinVal =
6977         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
6978     MachineInstrBuilder Hi, Lo;
6979     if (IsAdd) {
6980       auto Zero = MIRBuilder.buildConstant(Ty, 0);
6981       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
6982       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
6983     } else {
6984       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
6985       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
6986                                MaxVal);
6987       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
6988                                MinVal);
6989     }
6990     auto RHSClamped =
6991         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
6992     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
6993   } else {
6994     // uadd.sat(a, b) -> a + umin(~a, b)
6995     // usub.sat(a, b) -> a - umin(a, b)
6996     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
6997     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
6998     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
6999   }
7000 
7001   MI.eraseFromParent();
7002   return Legalized;
7003 }
7004 
7005 LegalizerHelper::LegalizeResult
7006 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7007   Register Res = MI.getOperand(0).getReg();
7008   Register LHS = MI.getOperand(1).getReg();
7009   Register RHS = MI.getOperand(2).getReg();
7010   LLT Ty = MRI.getType(Res);
7011   LLT BoolTy = Ty.changeElementSize(1);
7012   bool IsSigned;
7013   bool IsAdd;
7014   unsigned OverflowOp;
7015   switch (MI.getOpcode()) {
7016   default:
7017     llvm_unreachable("unexpected addsat/subsat opcode");
7018   case TargetOpcode::G_UADDSAT:
7019     IsSigned = false;
7020     IsAdd = true;
7021     OverflowOp = TargetOpcode::G_UADDO;
7022     break;
7023   case TargetOpcode::G_SADDSAT:
7024     IsSigned = true;
7025     IsAdd = true;
7026     OverflowOp = TargetOpcode::G_SADDO;
7027     break;
7028   case TargetOpcode::G_USUBSAT:
7029     IsSigned = false;
7030     IsAdd = false;
7031     OverflowOp = TargetOpcode::G_USUBO;
7032     break;
7033   case TargetOpcode::G_SSUBSAT:
7034     IsSigned = true;
7035     IsAdd = false;
7036     OverflowOp = TargetOpcode::G_SSUBO;
7037     break;
7038   }
7039 
7040   auto OverflowRes =
7041       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7042   Register Tmp = OverflowRes.getReg(0);
7043   Register Ov = OverflowRes.getReg(1);
7044   MachineInstrBuilder Clamp;
7045   if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
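    // The sign of tmp selects the bound: a positive overflow wraps tmp
    // negative, so (tmp >>s 31) is all-ones and adding 0x80000000 yields
    // 0x7fffffff (INT_MAX); a negative overflow leaves tmp non-negative and
    // the sum is 0x80000000 (INT_MIN).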
7052     uint64_t NumBits = Ty.getScalarSizeInBits();
7053     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7054     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7055     auto MinVal =
7056         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7057     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7058   } else {
7059     // uadd.sat(a, b) ->
7060     //   {tmp, ov} = uaddo(a, b)
7061     //   ov ? 0xffffffff : tmp
7062     // usub.sat(a, b) ->
7063     //   {tmp, ov} = usubo(a, b)
7064     //   ov ? 0 : tmp
7065     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7066   }
7067   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7068 
7069   MI.eraseFromParent();
7070   return Legalized;
7071 }
7072 
7073 LegalizerHelper::LegalizeResult
7074 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7075   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7076           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7077          "Expected shlsat opcode!");
7078   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7079   Register Res = MI.getOperand(0).getReg();
7080   Register LHS = MI.getOperand(1).getReg();
7081   Register RHS = MI.getOperand(2).getReg();
7082   LLT Ty = MRI.getType(Res);
7083   LLT BoolTy = Ty.changeElementSize(1);
7084 
7085   unsigned BW = Ty.getScalarSizeInBits();
7086   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7087   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7088                        : MIRBuilder.buildLShr(Ty, Result, RHS);
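  // Overflow is detected below by shifting the result back and comparing it
  // with the original LHS: e.g. for G_USHLSAT on s8, 0x40 << 2 wraps to 0x00
  // and 0x00 >> 2 != 0x40, so the result must saturate.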
7089 
7090   MachineInstrBuilder SatVal;
7091   if (IsSigned) {
7092     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7093     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7094     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7095                                     MIRBuilder.buildConstant(Ty, 0));
7096     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7097   } else {
7098     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7099   }
7100   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7101   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7102 
7103   MI.eraseFromParent();
7104   return Legalized;
7105 }
7106 
7107 LegalizerHelper::LegalizeResult
7108 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7109   Register Dst = MI.getOperand(0).getReg();
7110   Register Src = MI.getOperand(1).getReg();
7111   const LLT Ty = MRI.getType(Src);
7112   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7113   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7114 
7115   // Swap most and least significant byte, set remaining bytes in Res to zero.
7116   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7117   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7118   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7119   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
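  // E.g. for s32 Src = 0xAABBCCDD, Res now holds 0xDD0000AA; the loop below
  // ORs in the remaining bytes (0x00CC0000 and 0x0000BB00).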
7120 
7121   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7122   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7123     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
    APInt APMask = APInt::getBitsSet(SizeInBytes * 8, i * 8, i * 8 + 8);
7125     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7126     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7127     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7128     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7129     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7130     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7131     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7132     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7133     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7134     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7135   }
7136   Res.getInstr()->getOperand(0).setReg(Dst);
7137 
7138   MI.eraseFromParent();
7139   return Legalized;
7140 }
7141 
// { (Src & Mask) >> N } | { (Src << N) & Mask }
7143 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7144                                  MachineInstrBuilder Src, APInt Mask) {
7145   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7146   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7147   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7148   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7149   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7150   return B.buildOr(Dst, LHS, RHS);
7151 }
7152 
7153 LegalizerHelper::LegalizeResult
7154 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7155   Register Dst = MI.getOperand(0).getReg();
7156   Register Src = MI.getOperand(1).getReg();
7157   const LLT Ty = MRI.getType(Src);
7158   unsigned Size = Ty.getSizeInBits();
7159 
7160   MachineInstrBuilder BSWAP =
7161       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7162 
7163   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7164   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7165   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7166   MachineInstrBuilder Swap4 =
7167       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7168 
7169   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7172   MachineInstrBuilder Swap2 =
7173       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7174 
7175   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7178   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
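  // E.g. for s8 Src = 0xB2 (0b10110010): BSWAP is a no-op, Swap4 yields 0x2B,
  // Swap2 yields 0x8E, and the final Swap1 produces 0x4D (0b01001101), the
  // bit-reversed value.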
7179 
7180   MI.eraseFromParent();
7181   return Legalized;
7182 }
7183 
7184 LegalizerHelper::LegalizeResult
7185 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7186   MachineFunction &MF = MIRBuilder.getMF();
7187 
7188   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7189   int NameOpIdx = IsRead ? 1 : 0;
7190   int ValRegIndex = IsRead ? 0 : 1;
7191 
7192   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7193   const LLT Ty = MRI.getType(ValReg);
7194   const MDString *RegStr = cast<MDString>(
7195     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7196 
7197   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7198   if (!PhysReg.isValid())
7199     return UnableToLegalize;
7200 
7201   if (IsRead)
7202     MIRBuilder.buildCopy(ValReg, PhysReg);
7203   else
7204     MIRBuilder.buildCopy(PhysReg, ValReg);
7205 
7206   MI.eraseFromParent();
7207   return Legalized;
7208 }
7209 
7210 LegalizerHelper::LegalizeResult
7211 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
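  // Lower [SU]MULH by widening: extend both operands to twice the width,
  // multiply, shift the product right by the original bit width, and truncate
  // the high half back to the result type.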
7212   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7213   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7214   Register Result = MI.getOperand(0).getReg();
7215   LLT OrigTy = MRI.getType(Result);
7216   auto SizeInBits = OrigTy.getScalarSizeInBits();
7217   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7218 
7219   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7220   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7221   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7222   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7223 
7224   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7225   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7226   MIRBuilder.buildTrunc(Result, Shifted);
7227 
7228   MI.eraseFromParent();
7229   return Legalized;
7230 }
7231 
7232 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7233   // Implement vector G_SELECT in terms of XOR, AND, OR.
7234   Register DstReg = MI.getOperand(0).getReg();
7235   Register MaskReg = MI.getOperand(1).getReg();
7236   Register Op1Reg = MI.getOperand(2).getReg();
7237   Register Op2Reg = MI.getOperand(3).getReg();
7238   LLT DstTy = MRI.getType(DstReg);
7239   LLT MaskTy = MRI.getType(MaskReg);
7240   LLT Op1Ty = MRI.getType(Op1Reg);
7241   if (!DstTy.isVector())
7242     return UnableToLegalize;
7243 
  // Vector selects can have a scalar predicate. If so, splat it into a vector
  // and return, so that later legalization attempts can try again.
7246   if (MaskTy.isScalar()) {
7247     Register MaskElt = MaskReg;
7248     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7249       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7250     // Generate a vector splat idiom to be pattern matched later.
7251     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7252     Observer.changingInstr(MI);
7253     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7254     Observer.changedInstr(MI);
7255     return Legalized;
7256   }
7257 
  if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits())
    return UnableToLegalize;
7261 
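  // With a full-width vector mask, the select becomes pure bitwise math:
  //   Dst = (Op1 & Mask) | (Op2 & ~Mask)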
7262   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7263   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7264   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7265   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7266   MI.eraseFromParent();
7267   return Legalized;
7268 }
7269 
7270 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7271   // Split DIVREM into individual instructions.
7272   unsigned Opcode = MI.getOpcode();
7273 
7274   MIRBuilder.buildInstr(
7275       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7276                                         : TargetOpcode::G_UDIV,
7277       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7278   MIRBuilder.buildInstr(
7279       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7280                                         : TargetOpcode::G_UREM,
7281       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7282   MI.eraseFromParent();
7283   return Legalized;
7284 }
7285 
7286 LegalizerHelper::LegalizeResult
7287 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7288   // Expand %res = G_ABS %a into:
7289   // %v1 = G_ASHR %a, scalar_size-1
7290   // %v2 = G_ADD %a, %v1
7291   // %res = G_XOR %v2, %v1
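  // E.g. for %a = -5 on s32: %v1 = -1, %v2 = -6, and %res = -6 ^ -1 = 5. For
  // non-negative %a, %v1 = 0 and the expansion is the identity.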
7292   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7293   Register OpReg = MI.getOperand(1).getReg();
7294   auto ShiftAmt =
7295       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7296   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7297   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7298   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7299   MI.eraseFromParent();
7300   return Legalized;
7301 }
7302 
7303 LegalizerHelper::LegalizeResult
7304 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7305   // Expand %res = G_ABS %a into:
7306   // %v1 = G_CONSTANT 0
7307   // %v2 = G_SUB %v1, %a
7308   // %res = G_SMAX %a, %v2
7309   Register SrcReg = MI.getOperand(1).getReg();
7310   LLT Ty = MRI.getType(SrcReg);
7311   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7312   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7313   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7314   MI.eraseFromParent();
7315   return Legalized;
7316 }
7317 
7318 LegalizerHelper::LegalizeResult
7319 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7320   Register SrcReg = MI.getOperand(1).getReg();
7321   LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7323 
7324   // The source could be a scalar if the IR type was <1 x sN>.
7325   if (SrcTy.isScalar()) {
7326     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7327       return UnableToLegalize; // FIXME: handle extension.
7328     // This can be just a plain copy.
7329     Observer.changingInstr(MI);
7330     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7331     Observer.changedInstr(MI);
7332     return Legalized;
7333   }
  return UnableToLegalize;
7335 }
7336 
7337 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7338   // On Darwin, -Os means optimize for size without hurting performance, so
7339   // only really optimize for size when -Oz (MinSize) is used.
7340   if (MF.getTarget().getTargetTriple().isOSDarwin())
7341     return MF.getFunction().hasMinSize();
7342   return MF.getFunction().hasOptSize();
7343 }
7344 
7345 // Returns a list of types to use for memory op lowering in MemOps. A partial
7346 // port of findOptimalMemOpLowering in TargetLowering.
7347 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7348                                           unsigned Limit, const MemOp &Op,
7349                                           unsigned DstAS, unsigned SrcAS,
7350                                           const AttributeList &FuncAttributes,
7351                                           const TargetLowering &TLI) {
7352   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7353     return false;
7354 
7355   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7356 
7357   if (Ty == LLT()) {
7358     // Use the largest scalar type whose alignment constraints are satisfied.
7359     // We only need to check DstAlign here as SrcAlign is always greater or
7360     // equal to DstAlign (or zero).
7361     Ty = LLT::scalar(64);
7362     if (Op.isFixedDstAlign())
7363       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7364              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7365         Ty = LLT::scalar(Ty.getSizeInBytes());
7366     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7367     // FIXME: check for the largest legal type we can load/store to.
7368   }
7369 
7370   unsigned NumMemOps = 0;
7371   uint64_t Size = Op.size();
7372   while (Size) {
7373     unsigned TySize = Ty.getSizeInBytes();
7374     while (TySize > Size) {
      // For now, only use non-vector loads / stores for the left-over pieces.
7376       LLT NewTy = Ty;
7377       // FIXME: check for mem op safety and legality of the types. Not all of
7378       // SDAGisms map cleanly to GISel concepts.
7379       if (NewTy.isVector())
7380         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7381       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7382       unsigned NewTySize = NewTy.getSizeInBytes();
7383       assert(NewTySize > 0 && "Could not find appropriate type");
7384 
7385       // If the new LLT cannot cover all of the remaining bits, then consider
7386       // issuing a (or a pair of) unaligned and overlapping load / store.
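      // E.g. when copying 7 bytes with 4-byte accesses, a second 4-byte access
      // at offset 3 covers the tail while overlapping the first by one byte.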
7387       bool Fast;
      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
7389       MVT VT = getMVTForLLT(Ty);
7390       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7391           TLI.allowsMisalignedMemoryAccesses(
7392               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7393               MachineMemOperand::MONone, &Fast) &&
7394           Fast)
7395         TySize = Size;
7396       else {
7397         Ty = NewTy;
7398         TySize = NewTySize;
7399       }
7400     }
7401 
7402     if (++NumMemOps > Limit)
7403       return false;
7404 
7405     MemOps.push_back(Ty);
7406     Size -= TySize;
7407   }
7408 
7409   return true;
7410 }
7411 
7412 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7413   if (Ty.isVector())
7414     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7415                                 Ty.getNumElements());
7416   return IntegerType::get(C, Ty.getSizeInBits());
7417 }
7418 
7419 // Get a vectorized representation of the memset value operand, GISel edition.
7420 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7421   MachineRegisterInfo &MRI = *MIB.getMRI();
7422   unsigned NumBits = Ty.getScalarSizeInBits();
7423   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7424   if (!Ty.isVector() && ValVRegAndVal) {
7425     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7426     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7427     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7428   }
7429 
7430   // Extend the byte value to the larger type, and then multiply by a magic
7431   // value 0x010101... in order to replicate it across every byte.
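  // E.g. replicating 0xAB into an s32 value: zext(0xAB) * 0x01010101 =
  // 0xABABABAB.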
7432   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7433   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7434     return MIB.buildConstant(Ty, 0).getReg(0);
7435   }
7436 
7437   LLT ExtType = Ty.getScalarType();
7438   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7439   if (NumBits > 8) {
7440     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7441     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7442     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7443   }
7444 
7445   // For vector types create a G_BUILD_VECTOR.
7446   if (Ty.isVector())
7447     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7448 
7449   return Val;
7450 }
7451 
7452 LegalizerHelper::LegalizeResult
7453 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7454                              uint64_t KnownLen, Align Alignment,
7455                              bool IsVolatile) {
7456   auto &MF = *MI.getParent()->getParent();
7457   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7458   auto &DL = MF.getDataLayout();
7459   LLVMContext &C = MF.getFunction().getContext();
7460 
7461   assert(KnownLen != 0 && "Have a zero length memset length!");
7462 
7463   bool DstAlignCanChange = false;
7464   MachineFrameInfo &MFI = MF.getFrameInfo();
7465   bool OptSize = shouldLowerMemFuncForSize(MF);
7466 
7467   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7468   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7469     DstAlignCanChange = true;
7470 
7471   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7472   std::vector<LLT> MemOps;
7473 
7474   const auto &DstMMO = **MI.memoperands_begin();
7475   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7476 
7477   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7478   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7479 
7480   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7481                                      MemOp::Set(KnownLen, DstAlignCanChange,
7482                                                 Alignment,
7483                                                 /*IsZeroMemset=*/IsZeroVal,
7484                                                 /*IsVolatile=*/IsVolatile),
7485                                      DstPtrInfo.getAddrSpace(), ~0u,
7486                                      MF.getFunction().getAttributes(), TLI))
7487     return UnableToLegalize;
7488 
7489   if (DstAlignCanChange) {
7490     // Get an estimate of the type from the LLT.
7491     Type *IRTy = getTypeForLLT(MemOps[0], C);
7492     Align NewAlign = DL.getABITypeAlign(IRTy);
7493     if (NewAlign > Alignment) {
7494       Alignment = NewAlign;
7495       unsigned FI = FIDef->getOperand(1).getIndex();
7496       // Give the stack frame object a larger alignment if needed.
7497       if (MFI.getObjectAlign(FI) < Alignment)
7498         MFI.setObjectAlignment(FI, Alignment);
7499     }
7500   }
7501 
7502   MachineIRBuilder MIB(MI);
7503   // Find the largest store and generate the bit pattern for it.
7504   LLT LargestTy = MemOps[0];
7505   for (unsigned i = 1; i < MemOps.size(); i++)
7506     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7507       LargestTy = MemOps[i];
7508 
7509   // The memset stored value is always defined as an s8, so in order to make it
7510   // work with larger store types we need to repeat the bit pattern across the
7511   // wider type.
7512   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7513 
7514   if (!MemSetValue)
7515     return UnableToLegalize;
7516 
7517   // Generate the stores. For each store type in the list, we generate the
7518   // matching store of that type to the destination address.
7519   LLT PtrTy = MRI.getType(Dst);
7520   unsigned DstOff = 0;
7521   unsigned Size = KnownLen;
7522   for (unsigned I = 0; I < MemOps.size(); I++) {
7523     LLT Ty = MemOps[I];
7524     unsigned TySize = Ty.getSizeInBytes();
7525     if (TySize > Size) {
7526       // Issuing an unaligned load / store pair that overlaps with the previous
7527       // pair. Adjust the offset accordingly.
7528       assert(I == MemOps.size() - 1 && I != 0);
7529       DstOff -= TySize - Size;
7530     }
7531 
    // If this store is smaller than the largest store, see whether we can get
    // the smaller value for free with a truncate.
7534     Register Value = MemSetValue;
7535     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7536       MVT VT = getMVTForLLT(Ty);
7537       MVT LargestVT = getMVTForLLT(LargestTy);
7538       if (!LargestTy.isVector() && !Ty.isVector() &&
7539           TLI.isTruncateFree(LargestVT, VT))
7540         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7541       else
7542         Value = getMemsetValue(Val, Ty, MIB);
7543       if (!Value)
7544         return UnableToLegalize;
7545     }
7546 
7547     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7548 
7549     Register Ptr = Dst;
7550     if (DstOff != 0) {
7551       auto Offset =
7552           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7553       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7554     }
7555 
7556     MIB.buildStore(Value, Ptr, *StoreMMO);
7557     DstOff += Ty.getSizeInBytes();
7558     Size -= TySize;
7559   }
7560 
7561   MI.eraseFromParent();
7562   return Legalized;
7563 }
7564 
7565 LegalizerHelper::LegalizeResult
7566 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7567   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7568 
7569   Register Dst = MI.getOperand(0).getReg();
7570   Register Src = MI.getOperand(1).getReg();
7571   Register Len = MI.getOperand(2).getReg();
7572 
7573   const auto *MMOIt = MI.memoperands_begin();
7574   const MachineMemOperand *MemOp = *MMOIt;
7575   bool IsVolatile = MemOp->isVolatile();
7576 
7577   // See if this is a constant length copy
7578   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7579   // FIXME: support dynamically sized G_MEMCPY_INLINE
7580   assert(LenVRegAndVal.hasValue() &&
7581          "inline memcpy with dynamic size is not yet supported");
7582   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7583   if (KnownLen == 0) {
7584     MI.eraseFromParent();
7585     return Legalized;
7586   }
7587 
7588   const auto &DstMMO = **MI.memoperands_begin();
7589   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7590   Align DstAlign = DstMMO.getBaseAlign();
7591   Align SrcAlign = SrcMMO.getBaseAlign();
7592 
7593   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7594                            IsVolatile);
7595 }
7596 
7597 LegalizerHelper::LegalizeResult
7598 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7599                                    uint64_t KnownLen, Align DstAlign,
7600                                    Align SrcAlign, bool IsVolatile) {
7601   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7602   return lowerMemcpy(MI, Dst, Src, KnownLen,
7603                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7604                      IsVolatile);
7605 }
7606 
7607 LegalizerHelper::LegalizeResult
7608 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7609                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7610                              Align SrcAlign, bool IsVolatile) {
7611   auto &MF = *MI.getParent()->getParent();
7612   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7613   auto &DL = MF.getDataLayout();
7614   LLVMContext &C = MF.getFunction().getContext();
7615 
7616   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7617 
7618   bool DstAlignCanChange = false;
7619   MachineFrameInfo &MFI = MF.getFrameInfo();
7620   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7621 
7622   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7623   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7624     DstAlignCanChange = true;
7625 
7626   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7627   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7628   // if the memcpy is in a tail call position.
7629 
7630   std::vector<LLT> MemOps;
7631 
7632   const auto &DstMMO = **MI.memoperands_begin();
7633   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7634   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7635   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7636 
7637   if (!findGISelOptimalMemOpLowering(
7638           MemOps, Limit,
7639           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7640                       IsVolatile),
7641           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7642           MF.getFunction().getAttributes(), TLI))
7643     return UnableToLegalize;
7644 
7645   if (DstAlignCanChange) {
7646     // Get an estimate of the type from the LLT.
7647     Type *IRTy = getTypeForLLT(MemOps[0], C);
7648     Align NewAlign = DL.getABITypeAlign(IRTy);
7649 
7650     // Don't promote to an alignment that would require dynamic stack
7651     // realignment.
7652     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7653     if (!TRI->hasStackRealignment(MF))
7654       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7655         NewAlign = NewAlign / 2;
7656 
7657     if (NewAlign > Alignment) {
7658       Alignment = NewAlign;
7659       unsigned FI = FIDef->getOperand(1).getIndex();
7660       // Give the stack frame object a larger alignment if needed.
7661       if (MFI.getObjectAlign(FI) < Alignment)
7662         MFI.setObjectAlignment(FI, Alignment);
7663     }
7664   }
7665 
7666   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7667 
7668   MachineIRBuilder MIB(MI);
7669   // Now we need to emit a pair of load and stores for each of the types we've
7670   // collected. I.e. for each type, generate a load from the source pointer of
7671   // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores of
  // mixed types, depending on what the target specifies as good types to use.
7674   unsigned CurrOffset = 0;
7675   unsigned Size = KnownLen;
7676   for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
7679     if (CopyTy.getSizeInBytes() > Size)
7680       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7681 
7682     // Construct MMOs for the accesses.
7683     auto *LoadMMO =
7684         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7685     auto *StoreMMO =
7686         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7687 
7688     // Create the load.
7689     Register LoadPtr = Src;
7690     Register Offset;
7691     if (CurrOffset != 0) {
7692       LLT SrcTy = MRI.getType(Src);
7693       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
7694                    .getReg(0);
7695       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7696     }
7697     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7698 
7699     // Create the store.
7700     Register StorePtr = Dst;
7701     if (CurrOffset != 0) {
7702       LLT DstTy = MRI.getType(Dst);
7703       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7704     }
7705     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7706     CurrOffset += CopyTy.getSizeInBytes();
7707     Size -= CopyTy.getSizeInBytes();
7708   }
7709 
7710   MI.eraseFromParent();
7711   return Legalized;
7712 }
7713 
7714 LegalizerHelper::LegalizeResult
7715 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7716                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7717                               bool IsVolatile) {
7718   auto &MF = *MI.getParent()->getParent();
7719   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7720   auto &DL = MF.getDataLayout();
7721   LLVMContext &C = MF.getFunction().getContext();
7722 
7723   assert(KnownLen != 0 && "Have a zero length memmove length!");
7724 
7725   bool DstAlignCanChange = false;
7726   MachineFrameInfo &MFI = MF.getFrameInfo();
7727   bool OptSize = shouldLowerMemFuncForSize(MF);
7728   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7729 
7730   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7731   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7732     DstAlignCanChange = true;
7733 
7734   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7735   std::vector<LLT> MemOps;
7736 
7737   const auto &DstMMO = **MI.memoperands_begin();
7738   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7739   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7740   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7741 
  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
7745   if (!findGISelOptimalMemOpLowering(
7746           MemOps, Limit,
7747           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7748                       /*IsVolatile*/ true),
7749           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7750           MF.getFunction().getAttributes(), TLI))
7751     return UnableToLegalize;
7752 
7753   if (DstAlignCanChange) {
7754     // Get an estimate of the type from the LLT.
7755     Type *IRTy = getTypeForLLT(MemOps[0], C);
7756     Align NewAlign = DL.getABITypeAlign(IRTy);
7757 
7758     // Don't promote to an alignment that would require dynamic stack
7759     // realignment.
7760     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7761     if (!TRI->hasStackRealignment(MF))
7762       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7763         NewAlign = NewAlign / 2;
7764 
7765     if (NewAlign > Alignment) {
7766       Alignment = NewAlign;
7767       unsigned FI = FIDef->getOperand(1).getIndex();
7768       // Give the stack frame object a larger alignment if needed.
7769       if (MFI.getObjectAlign(FI) < Alignment)
7770         MFI.setObjectAlignment(FI, Alignment);
7771     }
7772   }
7773 
7774   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7775 
7776   MachineIRBuilder MIB(MI);
  // Memmove requires that we perform all the loads before issuing the stores:
  // the source and destination may overlap, so a store could otherwise clobber
  // source bytes that have not yet been read. Apart from that, this loop does
  // much the same thing as the memcpy codegen function.
7780   unsigned CurrOffset = 0;
7781   SmallVector<Register, 16> LoadVals;
7782   for (auto CopyTy : MemOps) {
7783     // Construct MMO for the load.
7784     auto *LoadMMO =
7785         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7786 
7787     // Create the load.
7788     Register LoadPtr = Src;
7789     if (CurrOffset != 0) {
7790       LLT SrcTy = MRI.getType(Src);
7791       auto Offset =
7792           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
7793       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7794     }
7795     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7796     CurrOffset += CopyTy.getSizeInBytes();
7797   }
7798 
7799   CurrOffset = 0;
7800   for (unsigned I = 0; I < MemOps.size(); ++I) {
7801     LLT CopyTy = MemOps[I];
7802     // Now store the values loaded.
7803     auto *StoreMMO =
7804         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7805 
7806     Register StorePtr = Dst;
7807     if (CurrOffset != 0) {
7808       LLT DstTy = MRI.getType(Dst);
7809       auto Offset =
7810           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
7811       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7812     }
7813     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7814     CurrOffset += CopyTy.getSizeInBytes();
7815   }
7816   MI.eraseFromParent();
7817   return Legalized;
7818 }
7819 
7820 LegalizerHelper::LegalizeResult
7821 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7822   const unsigned Opc = MI.getOpcode();
7823   // This combine is fairly complex so it's not written with a separate
7824   // matcher function.
7825   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7826           Opc == TargetOpcode::G_MEMSET) &&
7827          "Expected memcpy like instruction");
7828 
7829   auto MMOIt = MI.memoperands_begin();
7830   const MachineMemOperand *MemOp = *MMOIt;
7831 
7832   Align DstAlign = MemOp->getBaseAlign();
7833   Align SrcAlign;
7834   Register Dst = MI.getOperand(0).getReg();
7835   Register Src = MI.getOperand(1).getReg();
7836   Register Len = MI.getOperand(2).getReg();
7837 
7838   if (Opc != TargetOpcode::G_MEMSET) {
7839     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7840     MemOp = *(++MMOIt);
7841     SrcAlign = MemOp->getBaseAlign();
7842   }
7843 
7844   // See if this is a constant length copy
7845   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7846   if (!LenVRegAndVal)
7847     return UnableToLegalize;
7848   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7849 
7850   if (KnownLen == 0) {
7851     MI.eraseFromParent();
7852     return Legalized;
7853   }
7854 
7855   bool IsVolatile = MemOp->isVolatile();
7856   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7857     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7858                              IsVolatile);
7859 
7860   // Don't try to optimize volatile.
7861   if (IsVolatile)
7862     return UnableToLegalize;
7863 
7864   if (MaxLen && KnownLen > MaxLen)
7865     return UnableToLegalize;
7866 
7867   if (Opc == TargetOpcode::G_MEMCPY) {
7868     auto &MF = *MI.getParent()->getParent();
7869     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7870     bool OptSize = shouldLowerMemFuncForSize(MF);
7871     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7872     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7873                        IsVolatile);
7874   }
7875   if (Opc == TargetOpcode::G_MEMMOVE)
7876     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7877   if (Opc == TargetOpcode::G_MEMSET)
7878     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7879   return UnableToLegalize;
7880 }
7881