1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetFrameLowering.h"
24 #include "llvm/CodeGen/TargetInstrInfo.h"
25 #include "llvm/CodeGen/TargetLowering.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 
33 #define DEBUG_TYPE "legalizer"
34 
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace MIPatternMatch;
38 
39 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
40 ///
41 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
42 /// with any leftover piece as type \p LeftoverTy
43 ///
44 /// Returns -1 in the first element of the pair if the breakdown is not
45 /// satisfiable.
46 static std::pair<int, int>
47 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
48   assert(!LeftoverTy.isValid() && "this is an out argument");
49 
50   unsigned Size = OrigTy.getSizeInBits();
51   unsigned NarrowSize = NarrowTy.getSizeInBits();
52   unsigned NumParts = Size / NarrowSize;
53   unsigned LeftoverSize = Size - NumParts * NarrowSize;
54   assert(Size > NarrowSize);
55 
56   if (LeftoverSize == 0)
57     return {NumParts, 0};
58 
59   if (NarrowTy.isVector()) {
60     unsigned EltSize = OrigTy.getScalarSizeInBits();
61     if (LeftoverSize % EltSize != 0)
62       return {-1, -1};
63     LeftoverTy = LLT::scalarOrVector(
64         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
65   } else {
66     LeftoverTy = LLT::scalar(LeftoverSize);
67   }
68 
69   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
70   return std::make_pair(NumParts, NumLeftover);
71 }
72 
73 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
74 
75   if (!Ty.isScalar())
76     return nullptr;
77 
78   switch (Ty.getSizeInBits()) {
79   case 16:
80     return Type::getHalfTy(Ctx);
81   case 32:
82     return Type::getFloatTy(Ctx);
83   case 64:
84     return Type::getDoubleTy(Ctx);
85   case 80:
86     return Type::getX86_FP80Ty(Ctx);
87   case 128:
88     return Type::getFP128Ty(Ctx);
89   default:
90     return nullptr;
91   }
92 }
93 
// Construct a helper that takes both the LegalizerInfo and the TargetLowering
// from the function's subtarget.
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }
100 
// Construct a helper with a caller-supplied LegalizerInfo, for legalizing
// against rules other than the subtarget's defaults.
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
  : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
    TLI(*MF.getSubtarget().getTargetLowering()) { }
106 
// Perform one legalization step on \p MI: query the LegalizerInfo for the
// action to take and dispatch to the matching strategy. Returns AlreadyLegal,
// Legalized, or UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  // Insert new instructions at MI's position, carrying its debug location.
  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics are not covered by the rule table; the target legalizes them
  // directly.
  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
151 
152 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
153                                    SmallVectorImpl<Register> &VRegs) {
154   for (int i = 0; i < NumParts; ++i)
155     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
156   MIRBuilder.buildUnmerge(VRegs, Reg);
157 }
158 
159 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
160                                    LLT MainTy, LLT &LeftoverTy,
161                                    SmallVectorImpl<Register> &VRegs,
162                                    SmallVectorImpl<Register> &LeftoverRegs) {
163   assert(!LeftoverTy.isValid() && "this is an out argument");
164 
165   unsigned RegSize = RegTy.getSizeInBits();
166   unsigned MainSize = MainTy.getSizeInBits();
167   unsigned NumParts = RegSize / MainSize;
168   unsigned LeftoverSize = RegSize - NumParts * MainSize;
169 
170   // Use an unmerge when possible.
171   if (LeftoverSize == 0) {
172     for (unsigned I = 0; I < NumParts; ++I)
173       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
174     MIRBuilder.buildUnmerge(VRegs, Reg);
175     return true;
176   }
177 
178   if (MainTy.isVector()) {
179     unsigned EltSize = MainTy.getScalarSizeInBits();
180     if (LeftoverSize % EltSize != 0)
181       return false;
182     LeftoverTy = LLT::scalarOrVector(
183         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
184   } else {
185     LeftoverTy = LLT::scalar(LeftoverSize);
186   }
187 
188   // For irregular sizes, extract the individual parts.
189   for (unsigned I = 0; I != NumParts; ++I) {
190     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
191     VRegs.push_back(NewReg);
192     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
193   }
194 
195   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
196        Offset += LeftoverSize) {
197     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
198     LeftoverRegs.push_back(NewReg);
199     MIRBuilder.buildExtract(NewReg, Reg, Offset);
200   }
201 
202   return true;
203 }
204 
205 void LegalizerHelper::insertParts(Register DstReg,
206                                   LLT ResultTy, LLT PartTy,
207                                   ArrayRef<Register> PartRegs,
208                                   LLT LeftoverTy,
209                                   ArrayRef<Register> LeftoverRegs) {
210   if (!LeftoverTy.isValid()) {
211     assert(LeftoverRegs.empty());
212 
213     if (!ResultTy.isVector()) {
214       MIRBuilder.buildMerge(DstReg, PartRegs);
215       return;
216     }
217 
218     if (PartTy.isVector())
219       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
220     else
221       MIRBuilder.buildBuildVector(DstReg, PartRegs);
222     return;
223   }
224 
225   SmallVector<Register> GCDRegs;
226   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
227   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
228     extractGCDType(GCDRegs, GCDTy, PartReg);
229   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
230   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
231 }
232 
233 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
234 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
235                               const MachineInstr &MI) {
236   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
237 
238   const int StartIdx = Regs.size();
239   const int NumResults = MI.getNumOperands() - 1;
240   Regs.resize(Regs.size() + NumResults);
241   for (int I = 0; I != NumResults; ++I)
242     Regs[StartIdx + I] = MI.getOperand(I).getReg();
243 }
244 
245 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
246                                      LLT GCDTy, Register SrcReg) {
247   LLT SrcTy = MRI.getType(SrcReg);
248   if (SrcTy == GCDTy) {
249     // If the source already evenly divides the result type, we don't need to do
250     // anything.
251     Parts.push_back(SrcReg);
252   } else {
253     // Need to split into common type sized pieces.
254     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
255     getUnmergeResults(Parts, *Unmerge);
256   }
257 }
258 
259 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
260                                     LLT NarrowTy, Register SrcReg) {
261   LLT SrcTy = MRI.getType(SrcReg);
262   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
263   extractGCDType(Parts, GCDTy, SrcReg);
264   return GCDTy;
265 }
266 
// Merge the GCDTy-sized pieces in \p VRegs into NarrowTy-sized values that
// cover the LCM of \p DstTy and \p NarrowTy. When the provided sources do not
// cover the LCM type, the remainder is padded according to \p PadStrategy
// (G_ANYEXT, G_ZEXT or G_SEXT). On return \p VRegs holds the NarrowTy pieces;
// the LCM type is returned.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  // NumParts NarrowTy pieces cover LCMTy; each is built from NumSubParts
  // GCDTy pieces.
  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      // Past the end of the provided sources: substitute the pad value.
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
357 
358 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
359                                                ArrayRef<Register> RemergeRegs) {
360   LLT DstTy = MRI.getType(DstReg);
361 
362   // Create the merge to the widened source, and extract the relevant bits into
363   // the result.
364 
365   if (DstTy == LCMTy) {
366     MIRBuilder.buildMerge(DstReg, RemergeRegs);
367     return;
368   }
369 
370   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
371   if (DstTy.isScalar() && LCMTy.isScalar()) {
372     MIRBuilder.buildTrunc(DstReg, Remerge);
373     return;
374   }
375 
376   if (LCMTy.isVector()) {
377     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
378     SmallVector<Register, 8> UnmergeDefs(NumDefs);
379     UnmergeDefs[0] = DstReg;
380     for (unsigned I = 1; I != NumDefs; ++I)
381       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
382 
383     MIRBuilder.buildUnmerge(UnmergeDefs,
384                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
385     return;
386   }
387 
388   llvm_unreachable("unhandled case");
389 }
390 
// Map a generic opcode plus a scalar size in bits to the corresponding RTLIB
// libcall enumerator. Asserts on sizes with no libcall variant and on
// unhandled opcodes.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
// Integer libcalls exist for 32/64/128-bit operands.
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

// Floating-point libcalls additionally have an 80-bit (x87) variant.
#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
480 
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const TargetInstrInfo &TII,
                                    MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
      CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return.
  // next_nodbg skips over debug instructions when looking for that return.
  // NOTE(review): an already-tail-call successor is also rejected here —
  // presumably because it is not a plain return we can erase; confirm.
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}
510 
511 LegalizerHelper::LegalizeResult
512 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
513                     const CallLowering::ArgInfo &Result,
514                     ArrayRef<CallLowering::ArgInfo> Args,
515                     const CallingConv::ID CC) {
516   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
517 
518   CallLowering::CallLoweringInfo Info;
519   Info.CallConv = CC;
520   Info.Callee = MachineOperand::CreateES(Name);
521   Info.OrigRet = Result;
522   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
523   if (!CLI.lowerCall(MIRBuilder, Info))
524     return LegalizerHelper::UnableToLegalize;
525 
526   return LegalizerHelper::Legalized;
527 }
528 
529 LegalizerHelper::LegalizeResult
530 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
531                     const CallLowering::ArgInfo &Result,
532                     ArrayRef<CallLowering::ArgInfo> Args) {
533   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
534   const char *Name = TLI.getLibcallName(Libcall);
535   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
536   return createLibcall(MIRBuilder, Name, Result, Args, CC);
537 }
538 
539 // Useful for libcalls where all operands have the same type.
540 static LegalizerHelper::LegalizeResult
541 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
542               Type *OpType) {
543   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
544 
545   // FIXME: What does the original arg index mean here?
546   SmallVector<CallLowering::ArgInfo, 3> Args;
547   for (unsigned i = 1; i < MI.getNumOperands(); i++)
548     Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
549   return createLibcall(MIRBuilder, Libcall,
550                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
551 }
552 
// Lower a G_BZERO/G_MEMCPY/G_MEMMOVE/G_MEMSET instruction into a call to the
// corresponding runtime routine, emitting it as a tail call when the 'tail'
// immediate is set and the instruction is in tail position. Returns Legalized
// on success (the caller is expected to erase \p MI).
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  // Select the runtime routine matching the generic memory opcode.
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    break;
  default:
    return LegalizerHelper::UnableToLegalize;
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  // These routines return void as far as the caller is concerned.
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  // Tail-call only if the instruction requested it AND the position allows it.
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MIRBuilder.getTII(), MI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;


  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next && (Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
637 
// Map a generic conversion opcode plus source/destination IR types to the
// matching RTLIB conversion libcall.
static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  // The RTLIB query functions are keyed by MVT, so translate the IR types.
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}
659 
660 static LegalizerHelper::LegalizeResult
661 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
662                   Type *FromType) {
663   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
664   return createLibcall(MIRBuilder, Libcall,
665                        {MI.getOperand(0).getReg(), ToType, 0},
666                        {{MI.getOperand(1).getReg(), FromType, 0}});
667 }
668 
// Legalize \p MI by replacing it with a runtime library call. On success the
// instruction is erased and Legalized is returned; otherwise the instruction
// is left in place and UnableToLegalize is returned.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  // Integer operations: all operands share one integer type of the result
  // width.
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  // Floating-point operations: all operands share one FP type of the result
  // width; only widths with a libcall variant are accepted.
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  // FP-to-FP conversions: source and destination must both map to FP IR types.
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  // Memory intrinsics: createMemLibcall handles tail-call emission, so return
  // immediately after erasing MI rather than falling through.
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}
778 
/// Legalize \p MI by breaking the scalar type at \p TypeIdx into \p NarrowTy
/// sized pieces.
///
/// Dispatches on opcode: simple cases are handled inline (unmerge/op/merge
/// sequences, or trunc/ext conversions of a single operand), while the more
/// involved opcodes defer to dedicated narrowScalar* helpers. Returns
/// UnableToLegalize for unsupported opcode, type-index, or size combinations.
LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  // Size of the result (operand 0) and of one narrow piece, in bits.
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    // Exact multiple: build one undef per piece and glue them back together.
    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    // Emit one NarrowTy constant per full piece, taken from successive
    // NarrowSize-bit slices of the original value.
    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    // Any non-dividing remainder becomes a single smaller leftover constant.
    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    // Only the halving case is supported: unmerge the source in two and keep
    // the low half.
    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &MMO = **MI.memoperands_begin();
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    // Extending load disguised as a plain load (memory narrower than the
    // result): load at NarrowTy and any-extend to the destination.
    if (8 * MMO.getSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, MI.getOperand(1), MMO);
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      MI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    bool ZExt = MI.getOpcode() == TargetOpcode::G_ZEXTLOAD;
    Register DstReg = MI.getOperand(0).getReg();
    Register PtrReg = MI.getOperand(1).getReg();

    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = **MI.memoperands_begin();
    unsigned MemSize = MMO.getSizeInBits();

    // Load into NarrowTy: a plain load if the memory exactly matches, or a
    // (still-extending) narrow load if the memory is even smaller.
    if (MemSize == NarrowSize) {
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      MIRBuilder.buildLoadInstr(MI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    // Re-apply the requested extension from NarrowTy up to the destination.
    if (ZExt)
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    const auto &MMO = **MI.memoperands_begin();

    Register SrcReg = MI.getOperand(0).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    // NOTE(review): SrcTy.isVector() already bailed out above, so this check
    // looks unreachable — confirm whether vector-with-leftover support was
    // intended here.
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    // Truncating store disguised as a plain store (memory narrower than the
    // source): truncate to NarrowTy and store that.
    if (8 * MMO.getSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      // NOTE(review): shadows the outer MMO with an identical lookup.
      auto &MMO = **MI.memoperands_begin();
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, MI.getOperand(1), MMO);
      MI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(MI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C2
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    // TypeIdx 1 is the source being counted; dispatch to the per-opcode
    // expansions.
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    // TypeIdx 0: the count itself always fits; just narrow the result and
    // zero-extend it back.
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    // Split each incoming value in its predecessor block, rebuild NumParts
    // narrow PHIs, then merge their results back at the start of this block.
    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    // The merge must come after all PHIs in the block.
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    // TypeIdx 2 is the index operand; it is operand 2 for extract and
    // operand 3 for insert.
    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      // Ordered compare: compare the high halves; if they are equal, the
      // result is decided by an unsigned compare of the low halves.
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    // Apply the operation to each narrow piece, reversing the piece order so
    // the bytes/bits swap across the whole value.
    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}
1273 
1274 Register LegalizerHelper::coerceToScalar(Register Val) {
1275   LLT Ty = MRI.getType(Val);
1276   if (Ty.isScalar())
1277     return Val;
1278 
1279   const DataLayout &DL = MIRBuilder.getDataLayout();
1280   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1281   if (Ty.isPointer()) {
1282     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1283       return Register();
1284     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1285   }
1286 
1287   Register NewVal = Val;
1288 
1289   assert(Ty.isVector());
1290   LLT EltTy = Ty.getElementType();
1291   if (EltTy.isPointer())
1292     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1293   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1294 }
1295 
1296 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1297                                      unsigned OpIdx, unsigned ExtOpcode) {
1298   MachineOperand &MO = MI.getOperand(OpIdx);
1299   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1300   MO.setReg(ExtB.getReg(0));
1301 }
1302 
1303 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1304                                       unsigned OpIdx) {
1305   MachineOperand &MO = MI.getOperand(OpIdx);
1306   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1307   MO.setReg(ExtB.getReg(0));
1308 }
1309 
1310 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1311                                      unsigned OpIdx, unsigned TruncOpcode) {
1312   MachineOperand &MO = MI.getOperand(OpIdx);
1313   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1314   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1315   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1316   MO.setReg(DstExt);
1317 }
1318 
1319 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1320                                       unsigned OpIdx, unsigned ExtOpcode) {
1321   MachineOperand &MO = MI.getOperand(OpIdx);
1322   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1323   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1324   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1325   MO.setReg(DstTrunc);
1326 }
1327 
1328 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1329                                             unsigned OpIdx) {
1330   MachineOperand &MO = MI.getOperand(OpIdx);
1331   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1332   MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
1333 }
1334 
1335 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1336                                             unsigned OpIdx) {
1337   MachineOperand &MO = MI.getOperand(OpIdx);
1338 
1339   LLT OldTy = MRI.getType(MO.getReg());
1340   unsigned OldElts = OldTy.getNumElements();
1341   unsigned NewElts = MoreTy.getNumElements();
1342 
1343   unsigned NumParts = NewElts / OldElts;
1344 
1345   // Use concat_vectors if the result is a multiple of the number of elements.
1346   if (NumParts * OldElts == NewElts) {
1347     SmallVector<Register, 8> Parts;
1348     Parts.push_back(MO.getReg());
1349 
1350     Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
1351     for (unsigned I = 1; I != NumParts; ++I)
1352       Parts.push_back(ImpDef);
1353 
1354     auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
1355     MO.setReg(Concat.getReg(0));
1356     return;
1357   }
1358 
1359   Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
1360   Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
1361   MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
1362   MO.setReg(MoreReg);
1363 }
1364 
1365 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1366   MachineOperand &Op = MI.getOperand(OpIdx);
1367   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1368 }
1369 
1370 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1371   MachineOperand &MO = MI.getOperand(OpIdx);
1372   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1373   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1374   MIRBuilder.buildBitcast(MO, CastDst);
1375   MO.setReg(CastDst);
1376 }
1377 
/// Legalize G_MERGE_VALUES by widening the source type (TypeIdx 1) to
/// \p WideTy.
///
/// If \p WideTy covers the whole destination, the sources are zero-extended,
/// shifted into position and OR'd together directly. Otherwise, the sources
/// are unmerged down to the GCD of the source and wide sizes, regrouped into
/// \p WideTy pieces (padding with undef as needed), and re-merged into the
/// destination, with a trailing trunc if the pieces overshoot.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Src1 = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src1);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  // Number of WideTy pieces needed to cover the destination (rounded up).
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      // Write the final OR straight into DstReg when possible.
      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = greatestCommonDivisor(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> Parts;
  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
    Register SrcReg = MI.getOperand(I).getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  // NOTE(review): this compares a count of GCD-sized pieces against
  // NumMerge * WideSize (a bit count), so it over-pads; the extra undef
  // entries are simply never consumed by the merge loop below, which only
  // takes NumMerge * PartsPerGCD pieces — confirm intent.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMerge(DstReg, NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
    MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
  }

  MI.eraseFromParent();
  return Legalized;
}
1499 
1500 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1501   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1502   LLT OrigTy = MRI.getType(OrigReg);
1503   LLT LCMTy = getLCMType(WideTy, OrigTy);
1504 
1505   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1506   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1507 
1508   Register UnmergeSrc = WideReg;
1509 
1510   // Create a merge to the LCM type, padding with undef
1511   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1512   // =>
1513   // %1:_(<4 x s32>) = G_FOO
1514   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1515   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1516   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1517   if (NumMergeParts > 1) {
1518     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1519     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1520     MergeParts[0] = WideReg;
1521     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1522   }
1523 
1524   // Unmerge to the original register and pad with dead defs.
1525   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1526   UnmergeResults[0] = OrigReg;
1527   for (int I = 1; I != NumUnmergeParts; ++I)
1528     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1529 
1530   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1531   return WideReg;
1532 }
1533 
// Widen the source of a G_UNMERGE_VALUES of scalars. If the requested wide
// type covers the whole source, the results are extracted directly with
// shifts and truncates; otherwise the source is any-extended to the LCM type,
// unmerged at WideTy, and the pieces re-merged into the original results
// (padding with dead defs as needed).
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Operand layout: NumDst results followed by the single source operand.
  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    // The wide type covers the entire source, so no unmerge of WideTy pieces
    // is needed; each result is just a shifted/truncated slice of the source.
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        // Pointers in non-integral address spaces cannot be reinterpreted as
        // integers for bit extraction.
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    // Result I occupies bits [I * DstSize, (I + 1) * DstSize) of the source;
    // result 0 needs no shift.
    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    // Break every WideTy piece of the requested unmerge down to the GCD
    // type, then merge runs of PartsPerRemerge GCD pieces into each of the
    // original destinations.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
1662 
// Widen G_EXTRACT. For TypeIdx == 0 (the result), the extract is rewritten as
// a right shift + truncate in the (possibly any-extended) source type. For
// TypeIdx == 1 (the source), scalar sources are any-extended in place, and
// vector sources are handled only for element-aligned single-element
// extracts.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  LLT DstTy = MRI.getType(DstReg);
  // Bit offset of the extracted value within the source.
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    // Shift the requested bits down to bit 0, then truncate to the original
    // destination width.
    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  // TypeIdx == 1: widen the source. A scalar source can simply be
  // any-extended; the offset is unchanged since the extract still reads the
  // same low-bit positions.
  if (SrcTy.isScalar()) {
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // Only extracts of exactly one whole element are supported here.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  // The offset must be element-aligned so it can be rescaled below.
  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  // Rescale the bit offset by the ratio of the widened size to the original
  // source size so it still points at the same element index.
  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
1740 
1741 LegalizerHelper::LegalizeResult
1742 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1743                                    LLT WideTy) {
1744   if (TypeIdx != 0 || WideTy.isVector())
1745     return UnableToLegalize;
1746   Observer.changingInstr(MI);
1747   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1748   widenScalarDst(MI, WideTy);
1749   Observer.changedInstr(MI);
1750   return Legalized;
1751 }
1752 
// Widen the result (TypeIdx 0) of add/sub with overflow or carry. The
// operands are sign- or zero-extended to WideTy (matching the signedness of
// the operation), the arithmetic is done in WideTy, and overflow is detected
// by truncating the wide result back to the original width, re-extending it,
// and comparing against the wide result.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  if (TypeIdx == 1)
    return UnableToLegalize; // TODO

  // Select the wide arithmetic opcode, the extension kind for the inputs, and
  // the carry-in operand for the carry-consuming (*E) variants.
  unsigned Opcode;
  unsigned ExtOpcode;
  Optional<Register> CarryIn = None;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  // The signed carry variants are emitted as the unsigned wide ops; the
  // sign-extension of the inputs makes the compare below detect signed
  // overflow.
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    // The carry-consuming ops also define a carry-out, typed the same as the
    // original overflow result. That carry-out is unused; the final overflow
    // bit comes from the compare below.
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
1826 
1827 LegalizerHelper::LegalizeResult
1828 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1829                                          LLT WideTy) {
1830   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1831                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1832                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1833   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1834                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1835   // We can convert this to:
1836   //   1. Any extend iN to iM
1837   //   2. SHL by M-N
1838   //   3. [US][ADD|SUB|SHL]SAT
1839   //   4. L/ASHR by M-N
1840   //
1841   // It may be more efficient to lower this to a min and a max operation in
1842   // the higher precision arithmetic if the promoted operation isn't legal,
1843   // but this decision is up to the target's lowering request.
1844   Register DstReg = MI.getOperand(0).getReg();
1845 
1846   unsigned NewBits = WideTy.getScalarSizeInBits();
1847   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1848 
1849   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1850   // must not left shift the RHS to preserve the shift amount.
1851   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1852   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1853                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1854   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1855   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1856   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1857 
1858   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1859                                         {ShiftL, ShiftR}, MI.getFlags());
1860 
1861   // Use a shift that will preserve the number of sign bits when the trunc is
1862   // folded away.
1863   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1864                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1865 
1866   MIRBuilder.buildTrunc(DstReg, Result);
1867   MI.eraseFromParent();
1868   return Legalized;
1869 }
1870 
// Widen the result (TypeIdx 0) of G_SMULO / G_UMULO by performing the
// overflow-checking multiply in WideTy on sign/zero-extended inputs, then
// combining the wide overflow flag with a check that the high bits of the
// wide product correctly extend the low part.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  Register Result = MI.getOperand(0).getReg();
  Register OriginalOverflow = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  // Re-emit the same mulo opcode at the wider type; its overflow output may
  // feed the final overflow computation below.
  auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
                                    {LeftOperand, RightOperand});
  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part.  Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    // The wide multiply cannot overflow; the extension check alone decides.
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
1926 
1927 LegalizerHelper::LegalizeResult
1928 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1929   switch (MI.getOpcode()) {
1930   default:
1931     return UnableToLegalize;
1932   case TargetOpcode::G_EXTRACT:
1933     return widenScalarExtract(MI, TypeIdx, WideTy);
1934   case TargetOpcode::G_INSERT:
1935     return widenScalarInsert(MI, TypeIdx, WideTy);
1936   case TargetOpcode::G_MERGE_VALUES:
1937     return widenScalarMergeValues(MI, TypeIdx, WideTy);
1938   case TargetOpcode::G_UNMERGE_VALUES:
1939     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
1940   case TargetOpcode::G_SADDO:
1941   case TargetOpcode::G_SSUBO:
1942   case TargetOpcode::G_UADDO:
1943   case TargetOpcode::G_USUBO:
1944   case TargetOpcode::G_SADDE:
1945   case TargetOpcode::G_SSUBE:
1946   case TargetOpcode::G_UADDE:
1947   case TargetOpcode::G_USUBE:
1948     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
1949   case TargetOpcode::G_UMULO:
1950   case TargetOpcode::G_SMULO:
1951     return widenScalarMulo(MI, TypeIdx, WideTy);
1952   case TargetOpcode::G_SADDSAT:
1953   case TargetOpcode::G_SSUBSAT:
1954   case TargetOpcode::G_SSHLSAT:
1955   case TargetOpcode::G_UADDSAT:
1956   case TargetOpcode::G_USUBSAT:
1957   case TargetOpcode::G_USHLSAT:
1958     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
1959   case TargetOpcode::G_CTTZ:
1960   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1961   case TargetOpcode::G_CTLZ:
1962   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1963   case TargetOpcode::G_CTPOP: {
1964     if (TypeIdx == 0) {
1965       Observer.changingInstr(MI);
1966       widenScalarDst(MI, WideTy, 0);
1967       Observer.changedInstr(MI);
1968       return Legalized;
1969     }
1970 
1971     Register SrcReg = MI.getOperand(1).getReg();
1972 
1973     // First ZEXT the input.
1974     auto MIBSrc = MIRBuilder.buildZExt(WideTy, SrcReg);
1975     LLT CurTy = MRI.getType(SrcReg);
1976     if (MI.getOpcode() == TargetOpcode::G_CTTZ) {
1977       // The count is the same in the larger type except if the original
1978       // value was zero.  This can be handled by setting the bit just off
1979       // the top of the original type.
1980       auto TopBit =
1981           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
1982       MIBSrc = MIRBuilder.buildOr(
1983         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
1984     }
1985 
1986     // Perform the operation at the larger size.
1987     auto MIBNewOp = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy}, {MIBSrc});
1988     // This is already the correct result for CTPOP and CTTZs
1989     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
1990         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
1991       // The correct result is NewOp - (Difference in widety and current ty).
1992       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
1993       MIBNewOp = MIRBuilder.buildSub(
1994           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
1995     }
1996 
1997     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
1998     MI.eraseFromParent();
1999     return Legalized;
2000   }
2001   case TargetOpcode::G_BSWAP: {
2002     Observer.changingInstr(MI);
2003     Register DstReg = MI.getOperand(0).getReg();
2004 
2005     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2006     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2007     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2008     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2009 
2010     MI.getOperand(0).setReg(DstExt);
2011 
2012     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2013 
2014     LLT Ty = MRI.getType(DstReg);
2015     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2016     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2017     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2018 
2019     MIRBuilder.buildTrunc(DstReg, ShrReg);
2020     Observer.changedInstr(MI);
2021     return Legalized;
2022   }
2023   case TargetOpcode::G_BITREVERSE: {
2024     Observer.changingInstr(MI);
2025 
2026     Register DstReg = MI.getOperand(0).getReg();
2027     LLT Ty = MRI.getType(DstReg);
2028     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2029 
2030     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2031     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2032     MI.getOperand(0).setReg(DstExt);
2033     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2034 
2035     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2036     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2037     MIRBuilder.buildTrunc(DstReg, Shift);
2038     Observer.changedInstr(MI);
2039     return Legalized;
2040   }
2041   case TargetOpcode::G_FREEZE:
2042     Observer.changingInstr(MI);
2043     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2044     widenScalarDst(MI, WideTy);
2045     Observer.changedInstr(MI);
2046     return Legalized;
2047 
2048   case TargetOpcode::G_ABS:
2049     Observer.changingInstr(MI);
2050     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2051     widenScalarDst(MI, WideTy);
2052     Observer.changedInstr(MI);
2053     return Legalized;
2054 
2055   case TargetOpcode::G_ADD:
2056   case TargetOpcode::G_AND:
2057   case TargetOpcode::G_MUL:
2058   case TargetOpcode::G_OR:
2059   case TargetOpcode::G_XOR:
2060   case TargetOpcode::G_SUB:
2061     // Perform operation at larger width (any extension is fines here, high bits
2062     // don't affect the result) and then truncate the result back to the
2063     // original type.
2064     Observer.changingInstr(MI);
2065     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2066     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2067     widenScalarDst(MI, WideTy);
2068     Observer.changedInstr(MI);
2069     return Legalized;
2070 
2071   case TargetOpcode::G_SBFX:
2072   case TargetOpcode::G_UBFX:
2073     Observer.changingInstr(MI);
2074 
2075     if (TypeIdx == 0) {
2076       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2077       widenScalarDst(MI, WideTy);
2078     } else {
2079       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2080       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2081     }
2082 
2083     Observer.changedInstr(MI);
2084     return Legalized;
2085 
2086   case TargetOpcode::G_SHL:
2087     Observer.changingInstr(MI);
2088 
2089     if (TypeIdx == 0) {
2090       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2091       widenScalarDst(MI, WideTy);
2092     } else {
2093       assert(TypeIdx == 1);
2094       // The "number of bits to shift" operand must preserve its value as an
2095       // unsigned integer:
2096       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2097     }
2098 
2099     Observer.changedInstr(MI);
2100     return Legalized;
2101 
2102   case TargetOpcode::G_SDIV:
2103   case TargetOpcode::G_SREM:
2104   case TargetOpcode::G_SMIN:
2105   case TargetOpcode::G_SMAX:
2106     Observer.changingInstr(MI);
2107     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2108     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2109     widenScalarDst(MI, WideTy);
2110     Observer.changedInstr(MI);
2111     return Legalized;
2112 
2113   case TargetOpcode::G_SDIVREM:
2114     Observer.changingInstr(MI);
2115     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2116     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2117     widenScalarDst(MI, WideTy);
2118     widenScalarDst(MI, WideTy, 1);
2119     Observer.changedInstr(MI);
2120     return Legalized;
2121 
2122   case TargetOpcode::G_ASHR:
2123   case TargetOpcode::G_LSHR:
2124     Observer.changingInstr(MI);
2125 
2126     if (TypeIdx == 0) {
2127       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2128         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2129 
2130       widenScalarSrc(MI, WideTy, 1, CvtOp);
2131       widenScalarDst(MI, WideTy);
2132     } else {
2133       assert(TypeIdx == 1);
2134       // The "number of bits to shift" operand must preserve its value as an
2135       // unsigned integer:
2136       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2137     }
2138 
2139     Observer.changedInstr(MI);
2140     return Legalized;
2141   case TargetOpcode::G_UDIV:
2142   case TargetOpcode::G_UREM:
2143   case TargetOpcode::G_UMIN:
2144   case TargetOpcode::G_UMAX:
2145     Observer.changingInstr(MI);
2146     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2147     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2148     widenScalarDst(MI, WideTy);
2149     Observer.changedInstr(MI);
2150     return Legalized;
2151 
2152   case TargetOpcode::G_UDIVREM:
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2155     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2156     widenScalarDst(MI, WideTy);
2157     widenScalarDst(MI, WideTy, 1);
2158     Observer.changedInstr(MI);
2159     return Legalized;
2160 
2161   case TargetOpcode::G_SELECT:
2162     Observer.changingInstr(MI);
2163     if (TypeIdx == 0) {
2164       // Perform operation at larger width (any extension is fine here, high
2165       // bits don't affect the result) and then truncate the result back to the
2166       // original type.
2167       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2168       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2169       widenScalarDst(MI, WideTy);
2170     } else {
2171       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2172       // Explicit extension is required here since high bits affect the result.
2173       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2174     }
2175     Observer.changedInstr(MI);
2176     return Legalized;
2177 
2178   case TargetOpcode::G_FPTOSI:
2179   case TargetOpcode::G_FPTOUI:
2180     Observer.changingInstr(MI);
2181 
2182     if (TypeIdx == 0)
2183       widenScalarDst(MI, WideTy);
2184     else
2185       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2186 
2187     Observer.changedInstr(MI);
2188     return Legalized;
2189   case TargetOpcode::G_SITOFP:
2190     Observer.changingInstr(MI);
2191 
2192     if (TypeIdx == 0)
2193       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2194     else
2195       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2196 
2197     Observer.changedInstr(MI);
2198     return Legalized;
2199   case TargetOpcode::G_UITOFP:
2200     Observer.changingInstr(MI);
2201 
2202     if (TypeIdx == 0)
2203       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2204     else
2205       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2206 
2207     Observer.changedInstr(MI);
2208     return Legalized;
2209   case TargetOpcode::G_LOAD:
2210   case TargetOpcode::G_SEXTLOAD:
2211   case TargetOpcode::G_ZEXTLOAD:
2212     Observer.changingInstr(MI);
2213     widenScalarDst(MI, WideTy);
2214     Observer.changedInstr(MI);
2215     return Legalized;
2216 
2217   case TargetOpcode::G_STORE: {
2218     if (TypeIdx != 0)
2219       return UnableToLegalize;
2220 
2221     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2222     if (!Ty.isScalar())
2223       return UnableToLegalize;
2224 
2225     Observer.changingInstr(MI);
2226 
2227     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2228       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2229     widenScalarSrc(MI, WideTy, 0, ExtType);
2230 
2231     Observer.changedInstr(MI);
2232     return Legalized;
2233   }
2234   case TargetOpcode::G_CONSTANT: {
2235     MachineOperand &SrcMO = MI.getOperand(1);
2236     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2237     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2238         MRI.getType(MI.getOperand(0).getReg()));
2239     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2240             ExtOpc == TargetOpcode::G_ANYEXT) &&
2241            "Illegal Extend");
2242     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2243     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2244                            ? SrcVal.sext(WideTy.getSizeInBits())
2245                            : SrcVal.zext(WideTy.getSizeInBits());
2246     Observer.changingInstr(MI);
2247     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2248 
2249     widenScalarDst(MI, WideTy);
2250     Observer.changedInstr(MI);
2251     return Legalized;
2252   }
2253   case TargetOpcode::G_FCONSTANT: {
2254     MachineOperand &SrcMO = MI.getOperand(1);
2255     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2256     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2257     bool LosesInfo;
2258     switch (WideTy.getSizeInBits()) {
2259     case 32:
2260       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2261                   &LosesInfo);
2262       break;
2263     case 64:
2264       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2265                   &LosesInfo);
2266       break;
2267     default:
2268       return UnableToLegalize;
2269     }
2270 
2271     assert(!LosesInfo && "extend should always be lossless");
2272 
2273     Observer.changingInstr(MI);
2274     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2275 
2276     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2277     Observer.changedInstr(MI);
2278     return Legalized;
2279   }
2280   case TargetOpcode::G_IMPLICIT_DEF: {
2281     Observer.changingInstr(MI);
2282     widenScalarDst(MI, WideTy);
2283     Observer.changedInstr(MI);
2284     return Legalized;
2285   }
2286   case TargetOpcode::G_BRCOND:
2287     Observer.changingInstr(MI);
2288     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2289     Observer.changedInstr(MI);
2290     return Legalized;
2291 
2292   case TargetOpcode::G_FCMP:
2293     Observer.changingInstr(MI);
2294     if (TypeIdx == 0)
2295       widenScalarDst(MI, WideTy);
2296     else {
2297       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2298       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2299     }
2300     Observer.changedInstr(MI);
2301     return Legalized;
2302 
2303   case TargetOpcode::G_ICMP:
2304     Observer.changingInstr(MI);
2305     if (TypeIdx == 0)
2306       widenScalarDst(MI, WideTy);
2307     else {
2308       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2309                                MI.getOperand(1).getPredicate()))
2310                                ? TargetOpcode::G_SEXT
2311                                : TargetOpcode::G_ZEXT;
2312       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2313       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2314     }
2315     Observer.changedInstr(MI);
2316     return Legalized;
2317 
2318   case TargetOpcode::G_PTR_ADD:
2319     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2320     Observer.changingInstr(MI);
2321     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2322     Observer.changedInstr(MI);
2323     return Legalized;
2324 
2325   case TargetOpcode::G_PHI: {
2326     assert(TypeIdx == 0 && "Expecting only Idx 0");
2327 
2328     Observer.changingInstr(MI);
2329     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2330       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2331       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2332       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2333     }
2334 
2335     MachineBasicBlock &MBB = *MI.getParent();
2336     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2337     widenScalarDst(MI, WideTy);
2338     Observer.changedInstr(MI);
2339     return Legalized;
2340   }
2341   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2342     if (TypeIdx == 0) {
2343       Register VecReg = MI.getOperand(1).getReg();
2344       LLT VecTy = MRI.getType(VecReg);
2345       Observer.changingInstr(MI);
2346 
2347       widenScalarSrc(
2348           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2349           TargetOpcode::G_SEXT);
2350 
2351       widenScalarDst(MI, WideTy, 0);
2352       Observer.changedInstr(MI);
2353       return Legalized;
2354     }
2355 
2356     if (TypeIdx != 2)
2357       return UnableToLegalize;
2358     Observer.changingInstr(MI);
2359     // TODO: Probably should be zext
2360     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2361     Observer.changedInstr(MI);
2362     return Legalized;
2363   }
2364   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2365     if (TypeIdx == 1) {
2366       Observer.changingInstr(MI);
2367 
2368       Register VecReg = MI.getOperand(1).getReg();
2369       LLT VecTy = MRI.getType(VecReg);
2370       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2371 
2372       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2373       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2374       widenScalarDst(MI, WideVecTy, 0);
2375       Observer.changedInstr(MI);
2376       return Legalized;
2377     }
2378 
2379     if (TypeIdx == 2) {
2380       Observer.changingInstr(MI);
2381       // TODO: Probably should be zext
2382       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2383       Observer.changedInstr(MI);
2384       return Legalized;
2385     }
2386 
2387     return UnableToLegalize;
2388   }
2389   case TargetOpcode::G_FADD:
2390   case TargetOpcode::G_FMUL:
2391   case TargetOpcode::G_FSUB:
2392   case TargetOpcode::G_FMA:
2393   case TargetOpcode::G_FMAD:
2394   case TargetOpcode::G_FNEG:
2395   case TargetOpcode::G_FABS:
2396   case TargetOpcode::G_FCANONICALIZE:
2397   case TargetOpcode::G_FMINNUM:
2398   case TargetOpcode::G_FMAXNUM:
2399   case TargetOpcode::G_FMINNUM_IEEE:
2400   case TargetOpcode::G_FMAXNUM_IEEE:
2401   case TargetOpcode::G_FMINIMUM:
2402   case TargetOpcode::G_FMAXIMUM:
2403   case TargetOpcode::G_FDIV:
2404   case TargetOpcode::G_FREM:
2405   case TargetOpcode::G_FCEIL:
2406   case TargetOpcode::G_FFLOOR:
2407   case TargetOpcode::G_FCOS:
2408   case TargetOpcode::G_FSIN:
2409   case TargetOpcode::G_FLOG10:
2410   case TargetOpcode::G_FLOG:
2411   case TargetOpcode::G_FLOG2:
2412   case TargetOpcode::G_FRINT:
2413   case TargetOpcode::G_FNEARBYINT:
2414   case TargetOpcode::G_FSQRT:
2415   case TargetOpcode::G_FEXP:
2416   case TargetOpcode::G_FEXP2:
2417   case TargetOpcode::G_FPOW:
2418   case TargetOpcode::G_INTRINSIC_TRUNC:
2419   case TargetOpcode::G_INTRINSIC_ROUND:
2420   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2421     assert(TypeIdx == 0);
2422     Observer.changingInstr(MI);
2423 
2424     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2425       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2426 
2427     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2428     Observer.changedInstr(MI);
2429     return Legalized;
2430   case TargetOpcode::G_FPOWI: {
2431     if (TypeIdx != 0)
2432       return UnableToLegalize;
2433     Observer.changingInstr(MI);
2434     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2435     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2436     Observer.changedInstr(MI);
2437     return Legalized;
2438   }
2439   case TargetOpcode::G_INTTOPTR:
2440     if (TypeIdx != 1)
2441       return UnableToLegalize;
2442 
2443     Observer.changingInstr(MI);
2444     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2445     Observer.changedInstr(MI);
2446     return Legalized;
2447   case TargetOpcode::G_PTRTOINT:
2448     if (TypeIdx != 0)
2449       return UnableToLegalize;
2450 
2451     Observer.changingInstr(MI);
2452     widenScalarDst(MI, WideTy, 0);
2453     Observer.changedInstr(MI);
2454     return Legalized;
2455   case TargetOpcode::G_BUILD_VECTOR: {
2456     Observer.changingInstr(MI);
2457 
2458     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2459     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2460       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2461 
2462     // Avoid changing the result vector type if the source element type was
2463     // requested.
2464     if (TypeIdx == 1) {
2465       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2466     } else {
2467       widenScalarDst(MI, WideTy, 0);
2468     }
2469 
2470     Observer.changedInstr(MI);
2471     return Legalized;
2472   }
2473   case TargetOpcode::G_SEXT_INREG:
2474     if (TypeIdx != 0)
2475       return UnableToLegalize;
2476 
2477     Observer.changingInstr(MI);
2478     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2479     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2480     Observer.changedInstr(MI);
2481     return Legalized;
2482   case TargetOpcode::G_PTRMASK: {
2483     if (TypeIdx != 1)
2484       return UnableToLegalize;
2485     Observer.changingInstr(MI);
2486     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2487     Observer.changedInstr(MI);
2488     return Legalized;
2489   }
2490   }
2491 }
2492 
2493 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2494                              MachineIRBuilder &B, Register Src, LLT Ty) {
2495   auto Unmerge = B.buildUnmerge(Ty, Src);
2496   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2497     Pieces.push_back(Unmerge.getReg(I));
2498 }
2499 
/// Lower a G_BITCAST involving at least one vector type by unmerging the
/// source into pieces, bitcasting the pieces through an intermediate type if
/// the element sizes differ, and merging them back into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar source: split it into destination-element sized pieces and build
    // the result vector from them. No intermediate cast is needed.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  // Scalar-to-scalar bitcasts are not handled here.
  return UnableToLegalize;
}
2567 
2568 /// Figure out the bit offset into a register when coercing a vector index for
2569 /// the wide element type. This is only for the case when promoting vector to
2570 /// one with larger elements.
2571 //
2572 ///
2573 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2574 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2575 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2576                                                    Register Idx,
2577                                                    unsigned NewEltSize,
2578                                                    unsigned OldEltSize) {
2579   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2580   LLT IdxTy = B.getMRI()->getType(Idx);
2581 
2582   // Now figure out the amount we need to shift to get the target bits.
2583   auto OffsetMask = B.buildConstant(
2584     IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
2585   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2586   return B.buildShl(IdxTy, OffsetIdx,
2587                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2588 }
2589 
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
/// elements, index the bitcasted vector and extract the target element with bit
/// operations. This is intended to force the indexing in the native register
/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source vector type (type index 1) may be changed here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();
  LLT SrcVecTy = MRI.getType(SrcVec);
  LLT IdxTy = MRI.getType(Idx);

  LLT SrcEltTy = SrcVecTy.getElementType();
  // A non-vector cast type is treated as a single-element vector.
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
    //
    // Each old element must map to a whole number of new elements.
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // First new element covered by the requested old element.
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    // Extract each of the consecutive narrow elements that make up the
    // requested wide element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    // Each new element must cover a whole number of old elements.
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // If the cast type is a scalar, the "vector" is the single wide register
    // itself and no extract is needed.
    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  // Same element count: nothing useful to do.
  return UnableToLegalize;
}
2702 
2703 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2704 /// TargetReg, while preserving other bits in \p TargetReg.
2705 ///
2706 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2707 static Register buildBitFieldInsert(MachineIRBuilder &B,
2708                                     Register TargetReg, Register InsertReg,
2709                                     Register OffsetBits) {
2710   LLT TargetTy = B.getMRI()->getType(TargetReg);
2711   LLT InsertTy = B.getMRI()->getType(InsertReg);
2712   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2713   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2714 
2715   // Produce a bitmask of the value to insert
2716   auto EltMask = B.buildConstant(
2717     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2718                                    InsertTy.getSizeInBits()));
2719   // Shift it into position
2720   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2721   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2722 
2723   // Clear out the bits in the wide element
2724   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2725 
2726   // The value to insert has all zeros already, so stick it into the masked
2727   // wide element.
2728   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2729 }
2730 
/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the result/source vector type (type index 0) may be changed here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register Idx = MI.getOperand(3).getReg();

  LLT VecTy = MRI.getType(Dst);
  LLT IdxTy = MRI.getType(Idx);

  LLT VecEltTy = VecTy.getElementType();
  // A non-vector cast type is treated as a single-element vector.
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  if (NewNumElts < OldNumElts) {
    // Each new element must cover a whole number of old elements.
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // If the cast type is a scalar, the "vector" is the single wide register
    // itself and no extract is needed.
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Splice the new value into the extracted wide element, preserving the
    // neighboring narrow elements packed in the same register.
    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
        CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Casting to a smaller (or same sized) element type is not implemented.
  return UnableToLegalize;
}
2799 
/// Lower a scalar G_LOAD/G_SEXTLOAD/G_ZEXTLOAD the target can't handle
/// directly: widen a non-byte-sized memory access to the next byte-sized one,
/// or split a non-power-of-2 sized G_LOAD into two power-of-2 loads that are
/// shifted and OR'd back together.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerLoad(MachineInstr &MI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = **MI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();
  // Vector memory types are not handled by this lowering.
  if (MemTy.isVector())
    return UnableToLegalize;

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  // Size rounded up to whole bytes; differs from MemSizeInBits only for
  // non-byte-sized memory types.
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (MemSizeInBits != MemStoreSizeInBits) {
    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (MI.getOpcode() == TargetOpcode::G_SEXTLOAD) {
      // Sign-extend from the original memory width, not the widened one.
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (MI.getOpcode() == TargetOpcode::G_ZEXTLOAD ||
               WideMemTy == DstTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      // G_LOAD (anyext semantics for the extra bits).
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // From here on, only handle non-extending loads.
  if (DstTy.getSizeInBits() != MMO.getSizeInBits())
    return UnableToLegalize;

  if (MI.getOpcode() == TargetOpcode::G_LOAD) {
    // This load needs splitting into power of 2 sized loads.
    if (DstTy.isVector())
      return UnableToLegalize;
    if (isPowerOf2_32(DstTy.getSizeInBits()))
      return UnableToLegalize; // Don't know what we're being asked to do.

    // Our strategy here is to generate anyextending loads for the smaller
    // types up to next power-2 result type, and then combine the two larger
    // result values together, before truncating back down to the non-pow-2
    // type.
    // E.g. v1 = i24 load =>
    // v2 = i32 zextload (2 byte)
    // v3 = i32 load (1 byte)
    // v4 = i32 shl v3, 16
    // v5 = i32 or v4, v2
    // v1 = i24 trunc v5
    // By doing this we generate the correct truncate which should get
    // combined away as an artifact with a matching extend.
    uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
    uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;

    MachineFunction &MF = MIRBuilder.getMF();
    // Split the original memory operand into a low and a high part.
    MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
    MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
      &MMO, LargeSplitSize / 8, SmallSplitSize / 8);

    LLT PtrTy = MRI.getType(PtrReg);
    unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
    LLT AnyExtTy = LLT::scalar(AnyExtSize);
    auto LargeLoad = MIRBuilder.buildLoadInstr(
      TargetOpcode::G_ZEXTLOAD, AnyExtTy, PtrReg, *LargeMMO);

    // Address of the high part: base plus the size of the low part.
    auto OffsetCst = MIRBuilder.buildConstant(
      LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
    Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
    auto SmallPtr =
      MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
    auto SmallLoad = MIRBuilder.buildLoad(AnyExtTy, SmallPtr,
                                          *SmallMMO);

    // Position the high part above the low part and combine.
    auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
    auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
2907 
/// Lower a scalar G_STORE the target can't handle directly: widen a
/// non-byte-sized store to a byte-sized one with the upper bits zeroed, or
/// split a non-power-of-2 sized store into two power-of-2 truncating stores.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerStore(MachineInstr &MI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **MI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  // Vector stores are not handled by this lowering.
  if (SrcTy.isVector())
    return UnableToLegalize;

  unsigned StoreWidth = MemTy.getSizeInBits();
  // Size rounded up to whole bytes; differs from StoreWidth only for
  // non-byte-sized memory types.
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (StoreWidth != StoreSizeInBits) {
    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    // Zero the bits above the original memory width.
    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    MI.eraseFromParent();
    return Legalized;
  }

  if (isPowerOf2_32(MemTy.getSizeInBits()))
    return UnableToLegalize; // Don't know what we're being asked to do.

  // Extend to the next pow-2.
  const LLT ExtendTy = LLT::scalar(NextPowerOf2(MemTy.getSizeInBits()));
  auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  uint64_t LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
  uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
  auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
    LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr =
    MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);

  // Split the original memory operand into a low and a high part.
  MachineMemOperand *LargeMMO =
    MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
    MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  // The large store truncates ExtVal down to LargeMMO's size.
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  MI.eraseFromParent();
  return Legalized;
}
2980 
2981 LegalizerHelper::LegalizeResult
2982 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
2983   switch (MI.getOpcode()) {
2984   case TargetOpcode::G_LOAD: {
2985     if (TypeIdx != 0)
2986       return UnableToLegalize;
2987 
2988     Observer.changingInstr(MI);
2989     bitcastDst(MI, CastTy, 0);
2990     Observer.changedInstr(MI);
2991     return Legalized;
2992   }
2993   case TargetOpcode::G_STORE: {
2994     if (TypeIdx != 0)
2995       return UnableToLegalize;
2996 
2997     Observer.changingInstr(MI);
2998     bitcastSrc(MI, CastTy, 0);
2999     Observer.changedInstr(MI);
3000     return Legalized;
3001   }
3002   case TargetOpcode::G_SELECT: {
3003     if (TypeIdx != 0)
3004       return UnableToLegalize;
3005 
3006     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3007       LLVM_DEBUG(
3008           dbgs() << "bitcast action not implemented for vector select\n");
3009       return UnableToLegalize;
3010     }
3011 
3012     Observer.changingInstr(MI);
3013     bitcastSrc(MI, CastTy, 2);
3014     bitcastSrc(MI, CastTy, 3);
3015     bitcastDst(MI, CastTy, 0);
3016     Observer.changedInstr(MI);
3017     return Legalized;
3018   }
3019   case TargetOpcode::G_AND:
3020   case TargetOpcode::G_OR:
3021   case TargetOpcode::G_XOR: {
3022     Observer.changingInstr(MI);
3023     bitcastSrc(MI, CastTy, 1);
3024     bitcastSrc(MI, CastTy, 2);
3025     bitcastDst(MI, CastTy, 0);
3026     Observer.changedInstr(MI);
3027     return Legalized;
3028   }
3029   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3030     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3031   case TargetOpcode::G_INSERT_VECTOR_ELT:
3032     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3033   default:
3034     return UnableToLegalize;
3035   }
3036 }
3037 
3038 // Legalize an instruction by changing the opcode in place.
3039 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3040     Observer.changingInstr(MI);
3041     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3042     Observer.changedInstr(MI);
3043 }
3044 
3045 LegalizerHelper::LegalizeResult
3046 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3047   using namespace TargetOpcode;
3048 
3049   switch(MI.getOpcode()) {
3050   default:
3051     return UnableToLegalize;
3052   case TargetOpcode::G_BITCAST:
3053     return lowerBitcast(MI);
3054   case TargetOpcode::G_SREM:
3055   case TargetOpcode::G_UREM: {
3056     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3057     auto Quot =
3058         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3059                               {MI.getOperand(1), MI.getOperand(2)});
3060 
3061     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3062     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3063     MI.eraseFromParent();
3064     return Legalized;
3065   }
3066   case TargetOpcode::G_SADDO:
3067   case TargetOpcode::G_SSUBO:
3068     return lowerSADDO_SSUBO(MI);
3069   case TargetOpcode::G_UMULH:
3070   case TargetOpcode::G_SMULH:
3071     return lowerSMULH_UMULH(MI);
3072   case TargetOpcode::G_SMULO:
3073   case TargetOpcode::G_UMULO: {
3074     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3075     // result.
3076     Register Res = MI.getOperand(0).getReg();
3077     Register Overflow = MI.getOperand(1).getReg();
3078     Register LHS = MI.getOperand(2).getReg();
3079     Register RHS = MI.getOperand(3).getReg();
3080     LLT Ty = MRI.getType(Res);
3081 
3082     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3083                           ? TargetOpcode::G_SMULH
3084                           : TargetOpcode::G_UMULH;
3085 
3086     Observer.changingInstr(MI);
3087     const auto &TII = MIRBuilder.getTII();
3088     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3089     MI.RemoveOperand(1);
3090     Observer.changedInstr(MI);
3091 
3092     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3093     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3094 
3095     // Move insert point forward so we can use the Res register if needed.
3096     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3097 
3098     // For *signed* multiply, overflow is detected by checking:
3099     // (hi != (lo >> bitwidth-1))
3100     if (Opcode == TargetOpcode::G_SMULH) {
3101       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3102       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3103       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3104     } else {
3105       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3106     }
3107     return Legalized;
3108   }
3109   case TargetOpcode::G_FNEG: {
3110     Register Res = MI.getOperand(0).getReg();
3111     LLT Ty = MRI.getType(Res);
3112 
3113     // TODO: Handle vector types once we are able to
3114     // represent them.
3115     if (Ty.isVector())
3116       return UnableToLegalize;
3117     auto SignMask =
3118         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3119     Register SubByReg = MI.getOperand(1).getReg();
3120     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3121     MI.eraseFromParent();
3122     return Legalized;
3123   }
3124   case TargetOpcode::G_FSUB: {
3125     Register Res = MI.getOperand(0).getReg();
3126     LLT Ty = MRI.getType(Res);
3127 
3128     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3129     // First, check if G_FNEG is marked as Lower. If so, we may
3130     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3131     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3132       return UnableToLegalize;
3133     Register LHS = MI.getOperand(1).getReg();
3134     Register RHS = MI.getOperand(2).getReg();
3135     Register Neg = MRI.createGenericVirtualRegister(Ty);
3136     MIRBuilder.buildFNeg(Neg, RHS);
3137     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3138     MI.eraseFromParent();
3139     return Legalized;
3140   }
3141   case TargetOpcode::G_FMAD:
3142     return lowerFMad(MI);
3143   case TargetOpcode::G_FFLOOR:
3144     return lowerFFloor(MI);
3145   case TargetOpcode::G_INTRINSIC_ROUND:
3146     return lowerIntrinsicRound(MI);
3147   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3148     // Since round even is the assumed rounding mode for unconstrained FP
3149     // operations, rint and roundeven are the same operation.
3150     changeOpcode(MI, TargetOpcode::G_FRINT);
3151     return Legalized;
3152   }
3153   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3154     Register OldValRes = MI.getOperand(0).getReg();
3155     Register SuccessRes = MI.getOperand(1).getReg();
3156     Register Addr = MI.getOperand(2).getReg();
3157     Register CmpVal = MI.getOperand(3).getReg();
3158     Register NewVal = MI.getOperand(4).getReg();
3159     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3160                                   **MI.memoperands_begin());
3161     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3162     MI.eraseFromParent();
3163     return Legalized;
3164   }
3165   case TargetOpcode::G_LOAD:
3166   case TargetOpcode::G_SEXTLOAD:
3167   case TargetOpcode::G_ZEXTLOAD:
3168     return lowerLoad(MI);
3169   case TargetOpcode::G_STORE:
3170     return lowerStore(MI);
3171   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3172   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3173   case TargetOpcode::G_CTLZ:
3174   case TargetOpcode::G_CTTZ:
3175   case TargetOpcode::G_CTPOP:
3176     return lowerBitCount(MI);
3177   case G_UADDO: {
3178     Register Res = MI.getOperand(0).getReg();
3179     Register CarryOut = MI.getOperand(1).getReg();
3180     Register LHS = MI.getOperand(2).getReg();
3181     Register RHS = MI.getOperand(3).getReg();
3182 
3183     MIRBuilder.buildAdd(Res, LHS, RHS);
3184     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3185 
3186     MI.eraseFromParent();
3187     return Legalized;
3188   }
3189   case G_UADDE: {
3190     Register Res = MI.getOperand(0).getReg();
3191     Register CarryOut = MI.getOperand(1).getReg();
3192     Register LHS = MI.getOperand(2).getReg();
3193     Register RHS = MI.getOperand(3).getReg();
3194     Register CarryIn = MI.getOperand(4).getReg();
3195     LLT Ty = MRI.getType(Res);
3196 
3197     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3198     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3199     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3200     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3201 
3202     MI.eraseFromParent();
3203     return Legalized;
3204   }
3205   case G_USUBO: {
3206     Register Res = MI.getOperand(0).getReg();
3207     Register BorrowOut = MI.getOperand(1).getReg();
3208     Register LHS = MI.getOperand(2).getReg();
3209     Register RHS = MI.getOperand(3).getReg();
3210 
3211     MIRBuilder.buildSub(Res, LHS, RHS);
3212     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3213 
3214     MI.eraseFromParent();
3215     return Legalized;
3216   }
3217   case G_USUBE: {
3218     Register Res = MI.getOperand(0).getReg();
3219     Register BorrowOut = MI.getOperand(1).getReg();
3220     Register LHS = MI.getOperand(2).getReg();
3221     Register RHS = MI.getOperand(3).getReg();
3222     Register BorrowIn = MI.getOperand(4).getReg();
3223     const LLT CondTy = MRI.getType(BorrowOut);
3224     const LLT Ty = MRI.getType(Res);
3225 
3226     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3227     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3228     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3229 
3230     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3231     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3232     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3233 
3234     MI.eraseFromParent();
3235     return Legalized;
3236   }
3237   case G_UITOFP:
3238     return lowerUITOFP(MI);
3239   case G_SITOFP:
3240     return lowerSITOFP(MI);
3241   case G_FPTOUI:
3242     return lowerFPTOUI(MI);
3243   case G_FPTOSI:
3244     return lowerFPTOSI(MI);
3245   case G_FPTRUNC:
3246     return lowerFPTRUNC(MI);
3247   case G_FPOWI:
3248     return lowerFPOWI(MI);
3249   case G_SMIN:
3250   case G_SMAX:
3251   case G_UMIN:
3252   case G_UMAX:
3253     return lowerMinMax(MI);
3254   case G_FCOPYSIGN:
3255     return lowerFCopySign(MI);
3256   case G_FMINNUM:
3257   case G_FMAXNUM:
3258     return lowerFMinNumMaxNum(MI);
3259   case G_MERGE_VALUES:
3260     return lowerMergeValues(MI);
3261   case G_UNMERGE_VALUES:
3262     return lowerUnmergeValues(MI);
3263   case TargetOpcode::G_SEXT_INREG: {
3264     assert(MI.getOperand(2).isImm() && "Expected immediate");
3265     int64_t SizeInBits = MI.getOperand(2).getImm();
3266 
3267     Register DstReg = MI.getOperand(0).getReg();
3268     Register SrcReg = MI.getOperand(1).getReg();
3269     LLT DstTy = MRI.getType(DstReg);
3270     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3271 
3272     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3273     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3274     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3275     MI.eraseFromParent();
3276     return Legalized;
3277   }
3278   case G_EXTRACT_VECTOR_ELT:
3279   case G_INSERT_VECTOR_ELT:
3280     return lowerExtractInsertVectorElt(MI);
3281   case G_SHUFFLE_VECTOR:
3282     return lowerShuffleVector(MI);
3283   case G_DYN_STACKALLOC:
3284     return lowerDynStackAlloc(MI);
3285   case G_EXTRACT:
3286     return lowerExtract(MI);
3287   case G_INSERT:
3288     return lowerInsert(MI);
3289   case G_BSWAP:
3290     return lowerBswap(MI);
3291   case G_BITREVERSE:
3292     return lowerBitreverse(MI);
3293   case G_READ_REGISTER:
3294   case G_WRITE_REGISTER:
3295     return lowerReadWriteRegister(MI);
3296   case G_UADDSAT:
3297   case G_USUBSAT: {
3298     // Try to make a reasonable guess about which lowering strategy to use. The
3299     // target can override this with custom lowering and calling the
3300     // implementation functions.
3301     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3302     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3303       return lowerAddSubSatToMinMax(MI);
3304     return lowerAddSubSatToAddoSubo(MI);
3305   }
3306   case G_SADDSAT:
3307   case G_SSUBSAT: {
3308     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3309 
3310     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3311     // since it's a shorter expansion. However, we would need to figure out the
3312     // preferred boolean type for the carry out for the query.
3313     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3314       return lowerAddSubSatToMinMax(MI);
3315     return lowerAddSubSatToAddoSubo(MI);
3316   }
3317   case G_SSHLSAT:
3318   case G_USHLSAT:
3319     return lowerShlSat(MI);
3320   case G_ABS:
3321     return lowerAbsToAddXor(MI);
3322   case G_SELECT:
3323     return lowerSelect(MI);
3324   case G_SDIVREM:
3325   case G_UDIVREM:
3326     return lowerDIVREM(MI);
3327   case G_FSHL:
3328   case G_FSHR:
3329     return lowerFunnelShift(MI);
3330   case G_ROTL:
3331   case G_ROTR:
3332     return lowerRotate(MI);
3333   }
3334 }
3335 
3336 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3337                                                   Align MinAlign) const {
3338   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3339   // datalayout for the preferred alignment. Also there should be a target hook
3340   // for this to allow targets to reduce the alignment and ignore the
3341   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3342   // the type.
3343   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3344 }
3345 
3346 MachineInstrBuilder
3347 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3348                                       MachinePointerInfo &PtrInfo) {
3349   MachineFunction &MF = MIRBuilder.getMF();
3350   const DataLayout &DL = MIRBuilder.getDataLayout();
3351   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3352 
3353   unsigned AddrSpace = DL.getAllocaAddrSpace();
3354   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3355 
3356   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3357   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3358 }
3359 
3360 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3361                                         LLT VecTy) {
3362   int64_t IdxVal;
3363   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3364     return IdxReg;
3365 
3366   LLT IdxTy = B.getMRI()->getType(IdxReg);
3367   unsigned NElts = VecTy.getNumElements();
3368   if (isPowerOf2_32(NElts)) {
3369     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3370     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3371   }
3372 
3373   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3374       .getReg(0);
3375 }
3376 
3377 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3378                                                   Register Index) {
3379   LLT EltTy = VecTy.getElementType();
3380 
3381   // Calculate the element offset and add it to the pointer.
3382   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3383   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3384          "Converting bits to bytes lost precision");
3385 
3386   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3387 
3388   LLT IdxTy = MRI.getType(Index);
3389   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3390                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3391 
3392   LLT PtrTy = MRI.getType(VecPtr);
3393   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3394 }
3395 
3396 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3397     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3398   Register DstReg = MI.getOperand(0).getReg();
3399   LLT DstTy = MRI.getType(DstReg);
3400   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3401 
3402   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3403 
3404   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3405   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3406 
3407   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3408   MI.eraseFromParent();
3409   return Legalized;
3410 }
3411 
3412 // Handle splitting vector operations which need to have the same number of
3413 // elements in each type index, but each type index may have a different element
3414 // type.
3415 //
3416 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3417 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3418 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3419 //
3420 // Also handles some irregular breakdown cases, e.g.
3421 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3422 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3423 //             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
  MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
  // Only splitting via the result type index is implemented; the source types
  // are derived from the result breakdown below.
  if (TypeIdx != 0)
    return UnableToLegalize;

  const LLT NarrowTy0 = NarrowTyArg;
  const Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT LeftoverTy0;

  // All of the operands need to have the same number of elements, so if we can
  // determine a type breakdown for the result type, we can for all of the
  // source types.
  int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
  if (NumParts < 0)
    return UnableToLegalize;

  // The narrowed instructions are built un-inserted so that additional use
  // operands can be appended as each subsequent source operand is split.
  SmallVector<MachineInstrBuilder, 4> NewInsts;

  SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
  SmallVector<Register, 4> PartRegs, LeftoverRegs;

  // Split every source operand into NarrowTy-sized (plus leftover) pieces.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
    Register SrcReg = MI.getOperand(I).getReg();
    LLT SrcTyI = MRI.getType(SrcReg);
    // This operand's narrow type keeps NarrowTy0's element count but uses the
    // operand's own scalar type (operands may differ in element type, e.g.
    // shift amounts).
    const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
                                            : ElementCount::getFixed(1);
    LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
    LLT LeftoverTyI;

    // Split this operand into the requested typed registers, and any leftover
    // required to reproduce the original type.
    if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
                      LeftoverRegs))
      return UnableToLegalize;

    if (I == 1) {
      // For the first operand, create an instruction for each part and setup
      // the result.
      for (Register PartReg : PartRegs) {
        Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
        NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
                               .addDef(PartDstReg)
                               .addUse(PartReg));
        DstRegs.push_back(PartDstReg);
      }

      // Leftover pieces get their own instruction of the leftover type.
      for (Register LeftoverReg : LeftoverRegs) {
        Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
        NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
                               .addDef(PartDstReg)
                               .addUse(LeftoverReg));
        LeftoverDstRegs.push_back(PartDstReg);
      }
    } else {
      assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());

      // Add the newly created operand splits to the existing instructions. The
      // odd-sized pieces are ordered after the requested NarrowTyArg sized
      // pieces.
      unsigned InstCount = 0;
      for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
        NewInsts[InstCount++].addUse(PartRegs[J]);
      for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
        NewInsts[InstCount++].addUse(LeftoverRegs[J]);
    }

    // Reuse the scratch vectors for the next operand.
    PartRegs.clear();
    LeftoverRegs.clear();
  }

  // Insert the newly built operations and rebuild the result register.
  for (auto &MIB : NewInsts)
    MIRBuilder.insertInstr(MIB);

  insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3505 
3506 LegalizerHelper::LegalizeResult
3507 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3508                                           LLT NarrowTy) {
3509   if (TypeIdx != 0)
3510     return UnableToLegalize;
3511 
3512   Register DstReg = MI.getOperand(0).getReg();
3513   Register SrcReg = MI.getOperand(1).getReg();
3514   LLT DstTy = MRI.getType(DstReg);
3515   LLT SrcTy = MRI.getType(SrcReg);
3516 
3517   LLT NarrowTy0 = NarrowTy;
3518   LLT NarrowTy1;
3519   unsigned NumParts;
3520 
3521   if (NarrowTy.isVector()) {
3522     // Uneven breakdown not handled.
3523     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3524     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3525       return UnableToLegalize;
3526 
3527     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3528   } else {
3529     NumParts = DstTy.getNumElements();
3530     NarrowTy1 = SrcTy.getElementType();
3531   }
3532 
3533   SmallVector<Register, 4> SrcRegs, DstRegs;
3534   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3535 
3536   for (unsigned I = 0; I < NumParts; ++I) {
3537     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3538     MachineInstr *NewInst =
3539         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3540 
3541     NewInst->setFlags(MI.getFlags());
3542     DstRegs.push_back(DstReg);
3543   }
3544 
3545   if (NarrowTy.isVector())
3546     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3547   else
3548     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3549 
3550   MI.eraseFromParent();
3551   return Legalized;
3552 }
3553 
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
                                        LLT NarrowTy) {
  // Split a vector G_ICMP/G_FCMP into compares on narrower pieces and
  // reassemble the boolean results into the original destination.
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);

  unsigned NumParts;
  // NarrowTy0 is the result piece type, NarrowTy1 the source piece type.
  LLT NarrowTy0, NarrowTy1;

  if (TypeIdx == 0) {
    // Narrowing the result; derive the matching source piece type.
    unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
    unsigned OldElts = DstTy.getNumElements();

    NarrowTy0 = NarrowTy;
    NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
    NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
                                                  SrcTy.getScalarSizeInBits())
                                    : SrcTy.getElementType();

  } else {
    // Narrowing the source; derive the matching result piece type.
    unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
    unsigned OldElts = SrcTy.getNumElements();

    // NOTE(review): the scalar-NarrowTy arm of this ternary calls
    // NarrowTy.getNumElements() on a non-vector type — presumably this path is
    // only ever reached with a vector NarrowTy; confirm against callers.
    NumParts = NarrowTy.isVector() ? (OldElts / NewElts) :
      NarrowTy.getNumElements();
    NarrowTy0 =
        LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
    NarrowTy1 = NarrowTy;
  }

  // FIXME: Don't know how to handle the situation where the small vectors
  // aren't all the same size yet.
  if (NarrowTy1.isVector() &&
      NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
    return UnableToLegalize;

  CmpInst::Predicate Pred
    = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

  // Split both compare operands into NarrowTy1 pieces.
  SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
  extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);

  // Compare each pair of pieces; FCMP keeps the original fast-math flags.
  for (unsigned I = 0; I < NumParts; ++I) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    DstRegs.push_back(DstReg);

    if (MI.getOpcode() == TargetOpcode::G_ICMP)
      MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
    else {
      MachineInstr *NewCmp
        = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
      NewCmp->setFlags(MI.getFlags());
    }
  }

  // Recombine the narrow results into the original destination register.
  if (NarrowTy1.isVector())
    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
  else
    MIRBuilder.buildBuildVector(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3620 
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
                                           LLT NarrowTy) {
  // Split a G_SELECT with a vector result (and possibly a vector condition)
  // into selects on narrower pieces.
  Register DstReg = MI.getOperand(0).getReg();
  Register CondReg = MI.getOperand(1).getReg();

  unsigned NumParts = 0;
  // NarrowTy0 is the piece type of the result/operands, NarrowTy1 that of the
  // condition.
  LLT NarrowTy0, NarrowTy1;

  LLT DstTy = MRI.getType(DstReg);
  LLT CondTy = MRI.getType(CondReg);
  unsigned Size = DstTy.getSizeInBits();

  assert(TypeIdx == 0 || CondTy.isVector());

  if (TypeIdx == 0) {
    // Narrowing the result; the condition breakdown is derived from it.
    NarrowTy0 = NarrowTy;
    NarrowTy1 = CondTy;

    unsigned NarrowSize = NarrowTy0.getSizeInBits();
    // FIXME: Don't know how to handle the situation where the small vectors
    // aren't all the same size yet.
    if (Size % NarrowSize != 0)
      return UnableToLegalize;

    NumParts = Size / NarrowSize;

    // Need to break down the condition type
    if (CondTy.isVector()) {
      if (CondTy.getNumElements() == NumParts)
        NarrowTy1 = CondTy.getElementType();
      else
        NarrowTy1 =
            LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
                        CondTy.getScalarSizeInBits());
    }
  } else {
    // Narrowing the (vector) condition.
    NumParts = CondTy.getNumElements();
    if (NarrowTy.isVector()) {
      // TODO: Handle uneven breakdown.
      if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements())
        return UnableToLegalize;

      // NOTE(review): this arm always gives up (making the check above dead);
      // a vector NarrowTy for the condition is not implemented yet.
      return UnableToLegalize;
    } else {
      NarrowTy0 = DstTy.getElementType();
      NarrowTy1 = NarrowTy;
    }
  }

  // Split the condition (only if it is a vector) and both value operands.
  SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
  if (CondTy.isVector())
    extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);

  extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);

  // A scalar condition is shared by every narrow select.
  for (unsigned i = 0; i < NumParts; ++i) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
                           Src1Regs[i], Src2Regs[i]);
    DstRegs.push_back(DstReg);
  }

  // Recombine the narrow results into the original destination register.
  if (NarrowTy0.isVector())
    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
  else
    MIRBuilder.buildBuildVector(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3693 
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                        LLT NarrowTy) {
  // Split a G_PHI into NarrowTy-sized phis (plus a leftover-typed phi if the
  // type doesn't divide evenly), remerging the pieces in the result block and
  // extracting the incoming values in each predecessor.
  const Register DstReg = MI.getOperand(0).getReg();
  LLT PhiTy = MRI.getType(DstReg);
  LLT LeftoverTy;

  // All of the operands need to have the same number of elements, so if we can
  // determine a type breakdown for the result type, we can for all of the
  // source types.
  int NumParts, NumLeftover;
  std::tie(NumParts, NumLeftover)
    = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
  if (NumParts < 0)
    return UnableToLegalize;

  SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
  SmallVector<MachineInstrBuilder, 4> NewInsts;

  const int TotalNumParts = NumParts + NumLeftover;

  // Insert the new phis in the result block first.
  for (int I = 0; I != TotalNumParts; ++I) {
    // The first NumParts phis use NarrowTy; the rest use LeftoverTy.
    LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
    Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
    NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
                       .addDef(PartDstReg));
    if (I < NumParts)
      DstRegs.push_back(PartDstReg);
    else
      LeftoverDstRegs.push_back(PartDstReg);
  }

  // Rebuild the wide result after the phi group — merge code must not be
  // interleaved with the phis themselves.
  MachineBasicBlock *MBB = MI.getParent();
  MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
  insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);

  SmallVector<Register, 4> PartRegs, LeftoverRegs;

  // Insert code to extract the incoming values in each predecessor block.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    PartRegs.clear();
    LeftoverRegs.clear();

    Register SrcReg = MI.getOperand(I).getReg();
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    // The extraction happens in the predecessor, before its terminator.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());

    LLT Unused;
    if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
                      LeftoverRegs))
      return UnableToLegalize;

    // Add the newly created operand splits to the existing instructions. The
    // odd-sized pieces are ordered after the requested NarrowTy sized
    // pieces.
    for (int J = 0; J != TotalNumParts; ++J) {
      MachineInstrBuilder MIB = NewInsts[J];
      MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
      MIB.addMBB(&OpMBB);
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
3760 
3761 LegalizerHelper::LegalizeResult
3762 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3763                                                   unsigned TypeIdx,
3764                                                   LLT NarrowTy) {
3765   if (TypeIdx != 1)
3766     return UnableToLegalize;
3767 
3768   const int NumDst = MI.getNumOperands() - 1;
3769   const Register SrcReg = MI.getOperand(NumDst).getReg();
3770   LLT SrcTy = MRI.getType(SrcReg);
3771 
3772   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3773 
3774   // TODO: Create sequence of extracts.
3775   if (DstTy == NarrowTy)
3776     return UnableToLegalize;
3777 
3778   LLT GCDTy = getGCDType(SrcTy, NarrowTy);
3779   if (DstTy == GCDTy) {
3780     // This would just be a copy of the same unmerge.
3781     // TODO: Create extracts, pad with undef and create intermediate merges.
3782     return UnableToLegalize;
3783   }
3784 
3785   auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
3786   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3787   const int PartsPerUnmerge = NumDst / NumUnmerge;
3788 
3789   for (int I = 0; I != NumUnmerge; ++I) {
3790     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3791 
3792     for (int J = 0; J != PartsPerUnmerge; ++J)
3793       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3794     MIB.addUse(Unmerge.getReg(I));
3795   }
3796 
3797   MI.eraseFromParent();
3798   return Legalized;
3799 }
3800 
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
                                         LLT NarrowTy) {
  // Split a vector multiply-with-overflow into narrower operations of the same
  // opcode, then remerge both the result and overflow vectors.
  Register Result = MI.getOperand(0).getReg();
  Register Overflow = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();

  LLT SrcTy = MRI.getType(LHS);
  if (!SrcTy.isVector())
    return UnableToLegalize;

  LLT ElementType = SrcTy.getElementType();
  LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
  const ElementCount NumResult = SrcTy.getElementCount();
  LLT GCDTy = getGCDType(SrcTy, NarrowTy);

  // Unmerge the operands to smaller parts of GCD type.
  auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
  auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);

  const int NumOps = UnmergeLHS->getNumOperands() - 1;
  const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
  // Per-part result/overflow types keep the original element types but cover
  // only this part's share of the elements.
  LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
  LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);

  // Perform the operation over unmerged parts.
  SmallVector<Register, 8> ResultParts;
  SmallVector<Register, 8> OverflowParts;
  for (int I = 0; I != NumOps; ++I) {
    Register Operand1 = UnmergeLHS->getOperand(I).getReg();
    Register Operand2 = UnmergeRHS->getOperand(I).getReg();
    auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
                                         {Operand1, Operand2});
    ResultParts.push_back(PartMul->getOperand(0).getReg());
    OverflowParts.push_back(PartMul->getOperand(1).getReg());
  }

  // Merge the result pieces up to a (possibly wider) LCM type, and give the
  // overflow pieces a matching element count.
  LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
  LLT OverflowLCMTy =
      LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);

  // Recombine the pieces to the original result and overflow registers.
  buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
  buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
  MI.eraseFromParent();
  return Legalized;
}
3849 
3850 // Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces
3851 // a vector
3852 //
3853 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
3854 // undef as necessary.
3855 //
3856 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
3857 //   -> <2 x s16>
3858 //
3859 // %4:_(s16) = G_IMPLICIT_DEF
3860 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
3861 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
3862 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
3863 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
// %3:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %8
3865 LegalizerHelper::LegalizeResult
3866 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3867                                           LLT NarrowTy) {
3868   Register DstReg = MI.getOperand(0).getReg();
3869   LLT DstTy = MRI.getType(DstReg);
3870   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
3871   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
3872 
3873   // Break into a common type
3874   SmallVector<Register, 16> Parts;
3875   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3876     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
3877 
3878   // Build the requested new merge, padding with undef.
3879   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
3880                                   TargetOpcode::G_ANYEXT);
3881 
3882   // Pack into the original result register.
3883   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3884 
3885   MI.eraseFromParent();
3886   return Legalized;
3887 }
3888 
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  // Narrow a G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT by splitting the
  // vector into NarrowVecTy pieces and operating on the single piece that
  // contains the (constant) index.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  // The index is the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst =
      getConstantVRegValWithLookThrough(Idx, MRI, /*LookThroughInstrs*/ true,
                                        /*HandleFConstants*/ false);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    // Map the original index onto a piece number and an index within that
    // piece.
    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
    } else {
      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
3965 
// Legalize a non-extending G_LOAD / non-truncating G_STORE by splitting it
// into multiple smaller accesses of NarrowTy, plus one leftover-typed access
// when the value size is not an even multiple of NarrowTy.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (MMO->isAtomic())
    return UnableToLegalize;

  bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
  Register ValReg = MI.getOperand(0).getReg();
  Register AddrReg = MI.getOperand(1).getReg();
  LLT ValTy = MRI.getType(ValReg);

  // Reject extending loads / truncating stores: the piecewise split below is
  // only valid when the register size equals the memory size.
  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * MMO->getSize()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  // Compute how many NarrowTy parts (plus optional leftover) cover ValTy.
  // For a store we also split the value register itself here.
  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  // NumParts stays -1 when no satisfiable breakdown was found.
  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each element
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         Offset += PartSize, ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      // Compute the address of this piece.
      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      // Derive a memory operand describing just this slice of the original
      // access.
      MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
    }

    return Offset;
  };

  unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);

  // For loads, stitch the loaded pieces back together into the original
  // destination register.
  if (IsLoad) {
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
4057 
4058 LegalizerHelper::LegalizeResult
4059 LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
4060                                       LLT NarrowTy) {
4061   assert(TypeIdx == 0 && "only one type index expected");
4062 
4063   const unsigned Opc = MI.getOpcode();
4064   const int NumDefOps = MI.getNumExplicitDefs();
4065   const int NumSrcOps = MI.getNumOperands() - NumDefOps;
4066   const unsigned Flags = MI.getFlags();
4067   const unsigned NarrowSize = NarrowTy.getSizeInBits();
4068   const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
4069 
4070   assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
4071                                      "result and 1-3 sources or 2 results and "
4072                                      "1-2 sources");
4073 
4074   SmallVector<Register, 2> DstRegs;
4075   for (int I = 0; I < NumDefOps; ++I)
4076     DstRegs.push_back(MI.getOperand(I).getReg());
4077 
4078   // First of all check whether we are narrowing (changing the element type)
4079   // or reducing the vector elements
4080   const LLT DstTy = MRI.getType(DstRegs[0]);
4081   const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
4082 
4083   SmallVector<Register, 8> ExtractedRegs[3];
4084   SmallVector<Register, 8> Parts;
4085 
4086   // Break down all the sources into NarrowTy pieces we can operate on. This may
4087   // involve creating merges to a wider type, padded with undef.
4088   for (int I = 0; I != NumSrcOps; ++I) {
4089     Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
4090     LLT SrcTy = MRI.getType(SrcReg);
4091 
4092     // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
4093     // For fewerElements, this is a smaller vector with the same element type.
4094     LLT OpNarrowTy;
4095     if (IsNarrow) {
4096       OpNarrowTy = NarrowScalarTy;
4097 
4098       // In case of narrowing, we need to cast vectors to scalars for this to
4099       // work properly
4100       // FIXME: Can we do without the bitcast here if we're narrowing?
4101       if (SrcTy.isVector()) {
4102         SrcTy = LLT::scalar(SrcTy.getSizeInBits());
4103         SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
4104       }
4105     } else {
4106       auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
4107                                           : ElementCount::getFixed(1);
4108       OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
4109     }
4110 
4111     LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
4112 
4113     // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
4114     buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
4115                         TargetOpcode::G_ANYEXT);
4116   }
4117 
4118   SmallVector<Register, 8> ResultRegs[2];
4119 
4120   // Input operands for each sub-instruction.
4121   SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());
4122 
4123   int NumParts = ExtractedRegs[0].size();
4124   const unsigned DstSize = DstTy.getSizeInBits();
4125   const LLT DstScalarTy = LLT::scalar(DstSize);
4126 
4127   // Narrowing needs to use scalar types
4128   LLT DstLCMTy, NarrowDstTy;
4129   if (IsNarrow) {
4130     DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
4131     NarrowDstTy = NarrowScalarTy;
4132   } else {
4133     DstLCMTy = getLCMType(DstTy, NarrowTy);
4134     NarrowDstTy = NarrowTy;
4135   }
4136 
4137   // We widened the source registers to satisfy merge/unmerge size
4138   // constraints. We'll have some extra fully undef parts.
4139   const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
4140 
4141   for (int I = 0; I != NumRealParts; ++I) {
4142     // Emit this instruction on each of the split pieces.
4143     for (int J = 0; J != NumSrcOps; ++J)
4144       InputRegs[J] = ExtractedRegs[J][I];
4145 
4146     MachineInstrBuilder Inst;
4147     if (NumDefOps == 1)
4148       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
4149     else
4150       Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
4151                                    Flags);
4152 
4153     for (int J = 0; J != NumDefOps; ++J)
4154       ResultRegs[J].push_back(Inst.getReg(J));
4155   }
4156 
4157   // Fill out the widened result with undef instead of creating instructions
4158   // with undef inputs.
4159   int NumUndefParts = NumParts - NumRealParts;
4160   if (NumUndefParts != 0) {
4161     Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
4162     for (int I = 0; I != NumDefOps; ++I)
4163       ResultRegs[I].append(NumUndefParts, Undef);
4164   }
4165 
4166   // Extract the possibly padded result. Use a scratch register if we need to do
4167   // a final bitcast, otherwise use the original result register.
4168   Register MergeDstReg;
4169   for (int I = 0; I != NumDefOps; ++I) {
4170     if (IsNarrow && DstTy.isVector())
4171       MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
4172     else
4173       MergeDstReg = DstRegs[I];
4174 
4175     buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);
4176 
4177     // Recast to vector if we narrowed a vector
4178     if (IsNarrow && DstTy.isVector())
4179       MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
4180   }
4181 
4182   MI.eraseFromParent();
4183   return Legalized;
4184 }
4185 
4186 LegalizerHelper::LegalizeResult
4187 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4188                                               LLT NarrowTy) {
4189   Register DstReg = MI.getOperand(0).getReg();
4190   Register SrcReg = MI.getOperand(1).getReg();
4191   int64_t Imm = MI.getOperand(2).getImm();
4192 
4193   LLT DstTy = MRI.getType(DstReg);
4194 
4195   SmallVector<Register, 8> Parts;
4196   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4197   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4198 
4199   for (Register &R : Parts)
4200     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4201 
4202   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4203 
4204   MI.eraseFromParent();
4205   return Legalized;
4206 }
4207 
// Top-level dispatcher for the fewerElementsVector legalize action: route
// each supported opcode to the helper that knows how to split it into
// NarrowTy-sized operations. Unlisted opcodes are rejected.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;

  switch (MI.getOpcode()) {
  case G_IMPLICIT_DEF:
    return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
  // Opcodes handled by the generic GCD-extract / LCM-merge width reduction.
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FSHL:
  case G_FSHR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case G_UMULO:
  case G_SMULO:
    return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
  // Opcodes whose operands may use several distinct types.
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
    return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
  // Conversions between two (possibly different) element types.
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
    return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
  case G_SELECT:
    return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
  case G_PHI:
    return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
  // All non-sequential G_VECREDUCE_* opcodes (macro expands to case labels).
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  default:
    return UnableToLegalize;
  }
}
4339 
// Split a G_SHUFFLE_VECTOR with identical source/destination types into two
// half-width shuffles (Lo and Hi halves of the result), concatenated back
// together. Each half is built as a two-input shuffle of whichever two of
// the four split inputs it references, or as a G_BUILD_VECTOR of individual
// element extracts when it references more than two inputs. NarrowTy is
// ignored beyond a power-of-2 check: only a split into exactly 2 is done;
// further splitting is left to subsequent legalization rounds.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.getNumElements();

  // Split both sources in half, giving four candidate input vectors.
  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs.  Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element.  This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      // (A negative "undef" mask entry wraps to a huge unsigned value and is
      // caught by the bounds check below.)
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= array_lengthof(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element.  This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= array_lengthof(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(MIRBuilder
                            .buildExtractVectorElement(
                                EltTy, Inputs[Input],
                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
                            .getReg(0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
    }

    // Reset the mask accumulator for the second (Hi) half.
    Ops.clear();
  }

  // Glue the two halves back together into the full-width destination.
  MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
4481 
4482 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4483     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4484   unsigned Opc = MI.getOpcode();
4485   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4486          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4487          "Sequential reductions not expected");
4488 
4489   if (TypeIdx != 1)
4490     return UnableToLegalize;
4491 
4492   // The semantics of the normal non-sequential reductions allow us to freely
4493   // re-associate the operation.
4494   Register SrcReg = MI.getOperand(1).getReg();
4495   LLT SrcTy = MRI.getType(SrcReg);
4496   Register DstReg = MI.getOperand(0).getReg();
4497   LLT DstTy = MRI.getType(DstReg);
4498 
4499   if (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)
4500     return UnableToLegalize;
4501 
4502   SmallVector<Register> SplitSrcs;
4503   const unsigned NumParts = SrcTy.getNumElements() / NarrowTy.getNumElements();
4504   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4505   SmallVector<Register> PartialReductions;
4506   for (unsigned Part = 0; Part < NumParts; ++Part) {
4507     PartialReductions.push_back(
4508         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4509   }
4510 
4511   unsigned ScalarOpc;
4512   switch (Opc) {
4513   case TargetOpcode::G_VECREDUCE_FADD:
4514     ScalarOpc = TargetOpcode::G_FADD;
4515     break;
4516   case TargetOpcode::G_VECREDUCE_FMUL:
4517     ScalarOpc = TargetOpcode::G_FMUL;
4518     break;
4519   case TargetOpcode::G_VECREDUCE_FMAX:
4520     ScalarOpc = TargetOpcode::G_FMAXNUM;
4521     break;
4522   case TargetOpcode::G_VECREDUCE_FMIN:
4523     ScalarOpc = TargetOpcode::G_FMINNUM;
4524     break;
4525   case TargetOpcode::G_VECREDUCE_ADD:
4526     ScalarOpc = TargetOpcode::G_ADD;
4527     break;
4528   case TargetOpcode::G_VECREDUCE_MUL:
4529     ScalarOpc = TargetOpcode::G_MUL;
4530     break;
4531   case TargetOpcode::G_VECREDUCE_AND:
4532     ScalarOpc = TargetOpcode::G_AND;
4533     break;
4534   case TargetOpcode::G_VECREDUCE_OR:
4535     ScalarOpc = TargetOpcode::G_OR;
4536     break;
4537   case TargetOpcode::G_VECREDUCE_XOR:
4538     ScalarOpc = TargetOpcode::G_XOR;
4539     break;
4540   case TargetOpcode::G_VECREDUCE_SMAX:
4541     ScalarOpc = TargetOpcode::G_SMAX;
4542     break;
4543   case TargetOpcode::G_VECREDUCE_SMIN:
4544     ScalarOpc = TargetOpcode::G_SMIN;
4545     break;
4546   case TargetOpcode::G_VECREDUCE_UMAX:
4547     ScalarOpc = TargetOpcode::G_UMAX;
4548     break;
4549   case TargetOpcode::G_VECREDUCE_UMIN:
4550     ScalarOpc = TargetOpcode::G_UMIN;
4551     break;
4552   default:
4553     LLVM_DEBUG(dbgs() << "Can't legalize: unknown reduction kind.\n");
4554     return UnableToLegalize;
4555   }
4556 
4557   // If the types involved are powers of 2, we can generate intermediate vector
4558   // ops, before generating a final reduction operation.
4559   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4560       isPowerOf2_32(NarrowTy.getNumElements())) {
4561     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4562   }
4563 
4564   Register Acc = PartialReductions[0];
4565   for (unsigned Part = 1; Part < NumParts; ++Part) {
4566     if (Part == NumParts - 1) {
4567       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4568                             {Acc, PartialReductions[Part]});
4569     } else {
4570       Acc = MIRBuilder
4571                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4572                 .getReg(0);
4573     }
4574   }
4575   MI.eraseFromParent();
4576   return Legalized;
4577 }
4578 
4579 LegalizerHelper::LegalizeResult
4580 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4581                                         LLT SrcTy, LLT NarrowTy,
4582                                         unsigned ScalarOpc) {
4583   SmallVector<Register> SplitSrcs;
4584   // Split the sources into NarrowTy size pieces.
4585   extractParts(SrcReg, NarrowTy,
4586                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4587   // We're going to do a tree reduction using vector operations until we have
4588   // one NarrowTy size value left.
4589   while (SplitSrcs.size() > 1) {
4590     SmallVector<Register> PartialRdxs;
4591     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4592       Register LHS = SplitSrcs[Idx];
4593       Register RHS = SplitSrcs[Idx + 1];
4594       // Create the intermediate vector op.
4595       Register Res =
4596           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4597       PartialRdxs.push_back(Res);
4598     }
4599     SplitSrcs = std::move(PartialRdxs);
4600   }
4601   // Finally generate the requested NarrowTy based reduction.
4602   Observer.changingInstr(MI);
4603   MI.getOperand(1).setReg(SplitSrcs[0]);
4604   Observer.changedInstr(MI);
4605   return Legalized;
4606 }
4607 
// Narrow a G_SHL / G_LSHR / G_ASHR with a known constant shift amount \p Amt
// by splitting the operand into Lo/Hi halves of \p HalfTy and computing each
// half of the result directly, avoiding the general compare/select expansion.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // Shift by zero: just re-merge the unchanged halves.
  if (Amt.isNullValue()) {
    MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits;

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      // Shift amount exceeds the full width: both halves are zero.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // Lo is fully shifted out; Hi comes entirely from Lo.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: Hi becomes Lo, Lo becomes zero.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      // Amt < NVTBits: Lo = InL << Amt;
      // Hi = (InH << Amt) | (InL >> (NVTBits - Amt)).
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    if (Amt.ugt(VTBits)) {
      // Shift amount exceeds the full width: both halves are zero.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // Hi is fully shifted out; Lo comes entirely from Hi.
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: Lo becomes Hi, Hi becomes zero.
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      // Amt < NVTBits: Lo = (InL >> Amt) | (InH << (NVTBits - Amt));
      // Hi = InH >> Amt.
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    // G_ASHR: same structure as G_LSHR but Hi is filled with the sign bit.
    if (Amt.ugt(VTBits)) {
      // Shift amount exceeds the full width: both halves are sign-fill.
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      // Hi is all sign bits; Lo comes from Hi.
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      // Exactly a half-width shift: Lo becomes Hi, Hi is sign-fill.
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      // Amt < NVTBits: Lo = (InL >> Amt) | (InH << (NVTBits - Amt));
      // Hi = InH a>> Amt.
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  // Re-merge the computed halves into the destination.
  MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
4695 
4696 // TODO: Optimize if constant shift amount.
4697 LegalizerHelper::LegalizeResult
4698 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4699                                    LLT RequestedTy) {
4700   if (TypeIdx == 1) {
4701     Observer.changingInstr(MI);
4702     narrowScalarSrc(MI, RequestedTy, 2);
4703     Observer.changedInstr(MI);
4704     return Legalized;
4705   }
4706 
4707   Register DstReg = MI.getOperand(0).getReg();
4708   LLT DstTy = MRI.getType(DstReg);
4709   if (DstTy.isVector())
4710     return UnableToLegalize;
4711 
4712   Register Amt = MI.getOperand(2).getReg();
4713   LLT ShiftAmtTy = MRI.getType(Amt);
4714   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4715   if (DstEltSize % 2 != 0)
4716     return UnableToLegalize;
4717 
4718   // Ignore the input type. We can only go to exactly half the size of the
4719   // input. If that isn't small enough, the resulting pieces will be further
4720   // legalized.
4721   const unsigned NewBitSize = DstEltSize / 2;
4722   const LLT HalfTy = LLT::scalar(NewBitSize);
4723   const LLT CondTy = LLT::scalar(1);
4724 
4725   if (const MachineInstr *KShiftAmt =
4726           getOpcodeDef(TargetOpcode::G_CONSTANT, Amt, MRI)) {
4727     return narrowScalarShiftByConstant(
4728         MI, KShiftAmt->getOperand(1).getCImm()->getValue(), HalfTy, ShiftAmtTy);
4729   }
4730 
4731   // TODO: Expand with known bits.
4732 
4733   // Handle the fully general expansion by an unknown amount.
4734   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4735 
4736   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4737   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4738   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4739 
4740   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4741   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4742 
4743   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4744   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4745   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4746 
4747   Register ResultRegs[2];
4748   switch (MI.getOpcode()) {
4749   case TargetOpcode::G_SHL: {
4750     // Short: ShAmt < NewBitSize
4751     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4752 
4753     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4754     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4755     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4756 
4757     // Long: ShAmt >= NewBitSize
4758     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4759     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4760 
4761     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4762     auto Hi = MIRBuilder.buildSelect(
4763         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4764 
4765     ResultRegs[0] = Lo.getReg(0);
4766     ResultRegs[1] = Hi.getReg(0);
4767     break;
4768   }
4769   case TargetOpcode::G_LSHR:
4770   case TargetOpcode::G_ASHR: {
4771     // Short: ShAmt < NewBitSize
4772     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4773 
4774     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4775     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4776     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4777 
4778     // Long: ShAmt >= NewBitSize
4779     MachineInstrBuilder HiL;
4780     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4781       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4782     } else {
4783       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4784       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4785     }
4786     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4787                                      {InH, AmtExcess});     // Lo from Hi part.
4788 
4789     auto Lo = MIRBuilder.buildSelect(
4790         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4791 
4792     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4793 
4794     ResultRegs[0] = Lo.getReg(0);
4795     ResultRegs[1] = Hi.getReg(0);
4796     break;
4797   }
4798   default:
4799     llvm_unreachable("not a shift");
4800   }
4801 
4802   MIRBuilder.buildMerge(DstReg, ResultRegs);
4803   MI.eraseFromParent();
4804   return Legalized;
4805 }
4806 
/// Widen the result and every incoming value of a G_PHI to \p MoreTy.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  // G_PHI operands come in (value, predecessor-MBB) pairs starting at
  // operand 1, hence the stride of 2.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    // Each incoming value must be widened in its predecessor block, before
    // that block's terminators, so the widened value is available on the
    // incoming edge.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  // The re-narrowing of the widened result must come after the whole PHI
  // group (PHIs must stay contiguous at the block start).
  // NOTE(review): the decrement parks the insert point on the last PHI;
  // presumably moreElementsVectorDst advances past it so its extract lands
  // just after the PHIs — confirm against the Dst helper's implementation.
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}
4825 
/// Legalize \p MI by padding its vector type (at \p TypeIdx) out to the
/// wider vector type \p MoreTy; the operation is performed on the padded
/// vectors and the extra lanes of the result are not meaningful.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    // Only the result operand is rewritten here.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    // Widen the stored value (operand 0 of G_STORE).
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM: {
    // Binary ops: both sources and the result share one type; widen all
    // three together.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT:
    // Only the big source vector (type index 1) can be padded; the result
    // and the offset are unchanged.
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_FREEZE:
    // Result and first source share type index 0; widen both.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    // Vector conditions are not handled; a scalar condition is reused
    // unchanged for the widened select.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UNMERGE_VALUES: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    // All operands except the last are defs; the last is the source.
    int NumDst = MI.getNumOperands() - 1;
    // Pad the source vector, then rebuild the unmerge with enough defs to
    // cover the padded elements.
    moreElementsVectorSrc(MI, MoreTy, NumDst);

    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
    // Keep the original results...
    for (int I = 0; I != NumDst; ++I)
      MIB.addDef(MI.getOperand(I).getReg());

    // ...and add fresh (unused) results for the padding elements.
    int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
    for (int I = NumDst; I != NewNumDst; ++I)
      MIB.addDef(MRI.createGenericVirtualRegister(DstTy));

    MIB.addUse(MI.getOperand(NumDst).getReg());
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  default:
    return UnableToLegalize;
  }
}
4923 
4924 LegalizerHelper::LegalizeResult
4925 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
4926                                            unsigned int TypeIdx, LLT MoreTy) {
4927   if (TypeIdx != 0)
4928     return UnableToLegalize;
4929 
4930   Register DstReg = MI.getOperand(0).getReg();
4931   Register Src1Reg = MI.getOperand(1).getReg();
4932   Register Src2Reg = MI.getOperand(2).getReg();
4933   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4934   LLT DstTy = MRI.getType(DstReg);
4935   LLT Src1Ty = MRI.getType(Src1Reg);
4936   LLT Src2Ty = MRI.getType(Src2Reg);
4937   unsigned NumElts = DstTy.getNumElements();
4938   unsigned WidenNumElts = MoreTy.getNumElements();
4939 
4940   // Expect a canonicalized shuffle.
4941   if (DstTy != Src1Ty || DstTy != Src2Ty)
4942     return UnableToLegalize;
4943 
4944   moreElementsVectorSrc(MI, MoreTy, 1);
4945   moreElementsVectorSrc(MI, MoreTy, 2);
4946 
4947   // Adjust mask based on new input vector length.
4948   SmallVector<int, 16> NewMask;
4949   for (unsigned I = 0; I != NumElts; ++I) {
4950     int Idx = Mask[I];
4951     if (Idx < static_cast<int>(NumElts))
4952       NewMask.push_back(Idx);
4953     else
4954       NewMask.push_back(Idx - NumElts + WidenNumElts);
4955   }
4956   for (unsigned I = NumElts; I != WidenNumElts; ++I)
4957     NewMask.push_back(-1);
4958   moreElementsVectorDst(MI, MoreTy, 0);
4959   MIRBuilder.setInstrAndDebugLoc(MI);
4960   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4961                                 MI.getOperand(1).getReg(),
4962                                 MI.getOperand(2).getReg(), NewMask);
4963   MI.eraseFromParent();
4964   return Legalized;
4965 }
4966 
/// Emit a schoolbook (long) multiplication of multi-part integers.
/// \p Src1Regs and \p Src2Regs hold the NarrowTy-sized parts of the two
/// factors, index 0 being the low part. The parts of the product are written
/// into \p DstRegs, whose size may exceed the sources' (e.g. to obtain the
/// full double-width product needed for G_UMULH).
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  // Carry accumulated while summing column DstIdx; it is folded into the
  // next column's factor list on the following iteration.
  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    // These are the products Src1[DstIdx - i] * Src2[i] whose low halves
    // land in column DstIdx; the loop bounds clamp i so both part indices
    // stay within [0, SrcParts).
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    // The high half of Src1[DstIdx - 1 - i] * Src2[i] spills into this
    // column.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      // Use overflowing adds and count the carry-outs — they belong to the
      // next, more significant column.
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      // Plain adds suffice: any carry out of the topmost part is discarded.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
5029 
/// Narrow G_ADD/G_SUB and their carry/overflow variants into a chain of
/// NarrowTy-wide operations linked through carry flags.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // Select the narrow opcodes for the three positions in the carry chain:
  //   OpO - lowest part (produces the first carry-out, takes no carry-in),
  //   OpE - middle parts (unsigned carry in and out),
  //   OpF - final/top part; the signed variant is chosen for G_S* opcodes so
  //         the final overflow flag has the requested signedness.
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  // CarryDst: the instruction's own carry/overflow result, if present.
  // CarryIn: the incoming carry operand of the G_*ADDE/G_*SUBE forms.
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  // Split both sources into NarrowTy parts plus a possible leftover chunk of
  // a smaller type; both splits use the same types and stay in sync.
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);

  // Append the leftover pieces so the loop below walks every part in order.
  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      // No carry-in available: first part of an op without a carry-in
      // operand.
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      // Last part: OpF carries the overflow semantics of the original op.
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    // Thread this part's carry into the next part.
    CarryIn = CarryOut;
  }
  // Recombine the NarrowTy parts and the leftover into the wide result.
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              makeArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
5123 
5124 LegalizerHelper::LegalizeResult
5125 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5126   Register DstReg = MI.getOperand(0).getReg();
5127   Register Src1 = MI.getOperand(1).getReg();
5128   Register Src2 = MI.getOperand(2).getReg();
5129 
5130   LLT Ty = MRI.getType(DstReg);
5131   if (Ty.isVector())
5132     return UnableToLegalize;
5133 
5134   unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
5135   unsigned DstSize = Ty.getSizeInBits();
5136   unsigned NarrowSize = NarrowTy.getSizeInBits();
5137   if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
5138     return UnableToLegalize;
5139 
5140   unsigned NumDstParts = DstSize / NarrowSize;
5141   unsigned NumSrcParts = SrcSize / NarrowSize;
5142   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5143   unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
5144 
5145   SmallVector<Register, 2> Src1Parts, Src2Parts;
5146   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5147   extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
5148   extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
5149   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5150 
5151   // Take only high half of registers if this is high mul.
5152   ArrayRef<Register> DstRegs(
5153       IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
5154   MIRBuilder.buildMerge(DstReg, DstRegs);
5155   MI.eraseFromParent();
5156   return Legalized;
5157 }
5158 
5159 LegalizerHelper::LegalizeResult
5160 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5161                                    LLT NarrowTy) {
5162   if (TypeIdx != 0)
5163     return UnableToLegalize;
5164 
5165   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5166 
5167   Register Src = MI.getOperand(1).getReg();
5168   LLT SrcTy = MRI.getType(Src);
5169 
5170   // If all finite floats fit into the narrowed integer type, we can just swap
5171   // out the result type. This is practically only useful for conversions from
5172   // half to at least 16-bits, so just handle the one case.
5173   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5174       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5175     return UnableToLegalize;
5176 
5177   Observer.changingInstr(MI);
5178   narrowScalarDst(MI, NarrowTy, 0,
5179                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5180   Observer.changedInstr(MI);
5181   return Legalized;
5182 }
5183 
/// Narrow the source (type index 1) of a G_EXTRACT: split the source into
/// NarrowTy pieces, take from each piece the bits that overlap the extracted
/// range, and reassemble the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

  // The extracted value occupies bits [OpStart, OpStart + OpSize) of the
  // source.
  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    // Bit offset of source piece i.
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      // The extracted range starts before this piece: take from its start.
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      // The extracted range starts inside this piece.
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  // Reassemble the collected segments into the destination.
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMerge(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
5250 
/// Narrow (type index 0) a G_INSERT: split the big source into NarrowTy
/// pieces (plus leftover), patch the inserted value into the overlapping
/// pieces, and merge everything back together.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs);

  // Process the leftover piece together with the full-width ones.
  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  // The inserted value occupies bits [OpStart, OpStart + OpSize) of the
  // destination.
  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    // Bit offset of destination piece I.
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      // The inserted value starts before this piece: take its tail.
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      // The inserted value starts inside this piece.
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
        std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  // The leftover piece was any-extended to NarrowTy above, so the merged
  // value can be wider than the original type; truncate back if so.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMerge(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMerge(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
5334 
5335 LegalizerHelper::LegalizeResult
5336 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5337                                    LLT NarrowTy) {
5338   Register DstReg = MI.getOperand(0).getReg();
5339   LLT DstTy = MRI.getType(DstReg);
5340 
5341   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5342 
5343   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5344   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5345   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5346   LLT LeftoverTy;
5347   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5348                     Src0Regs, Src0LeftoverRegs))
5349     return UnableToLegalize;
5350 
5351   LLT Unused;
5352   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5353                     Src1Regs, Src1LeftoverRegs))
5354     llvm_unreachable("inconsistent extractParts result");
5355 
5356   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5357     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5358                                         {Src0Regs[I], Src1Regs[I]});
5359     DstRegs.push_back(Inst.getReg(0));
5360   }
5361 
5362   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5363     auto Inst = MIRBuilder.buildInstr(
5364       MI.getOpcode(),
5365       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5366     DstLeftoverRegs.push_back(Inst.getReg(0));
5367   }
5368 
5369   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5370               LeftoverTy, DstLeftoverRegs);
5371 
5372   MI.eraseFromParent();
5373   return Legalized;
5374 }
5375 
5376 LegalizerHelper::LegalizeResult
5377 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5378                                  LLT NarrowTy) {
5379   if (TypeIdx != 0)
5380     return UnableToLegalize;
5381 
5382   Register DstReg = MI.getOperand(0).getReg();
5383   Register SrcReg = MI.getOperand(1).getReg();
5384 
5385   LLT DstTy = MRI.getType(DstReg);
5386   if (DstTy.isVector())
5387     return UnableToLegalize;
5388 
5389   SmallVector<Register, 8> Parts;
5390   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5391   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5392   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5393 
5394   MI.eraseFromParent();
5395   return Legalized;
5396 }
5397 
5398 LegalizerHelper::LegalizeResult
5399 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5400                                     LLT NarrowTy) {
5401   if (TypeIdx != 0)
5402     return UnableToLegalize;
5403 
5404   Register CondReg = MI.getOperand(1).getReg();
5405   LLT CondTy = MRI.getType(CondReg);
5406   if (CondTy.isVector()) // TODO: Handle vselect
5407     return UnableToLegalize;
5408 
5409   Register DstReg = MI.getOperand(0).getReg();
5410   LLT DstTy = MRI.getType(DstReg);
5411 
5412   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5413   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5414   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5415   LLT LeftoverTy;
5416   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5417                     Src1Regs, Src1LeftoverRegs))
5418     return UnableToLegalize;
5419 
5420   LLT Unused;
5421   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5422                     Src2Regs, Src2LeftoverRegs))
5423     llvm_unreachable("inconsistent extractParts result");
5424 
5425   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5426     auto Select = MIRBuilder.buildSelect(NarrowTy,
5427                                          CondReg, Src1Regs[I], Src2Regs[I]);
5428     DstRegs.push_back(Select.getReg(0));
5429   }
5430 
5431   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5432     auto Select = MIRBuilder.buildSelect(
5433       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5434     DstLeftoverRegs.push_back(Select.getReg(0));
5435   }
5436 
5437   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5438               LeftoverTy, DstLeftoverRegs);
5439 
5440   MI.eraseFromParent();
5441   return Legalized;
5442 }
5443 
5444 LegalizerHelper::LegalizeResult
5445 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5446                                   LLT NarrowTy) {
5447   if (TypeIdx != 1)
5448     return UnableToLegalize;
5449 
5450   Register DstReg = MI.getOperand(0).getReg();
5451   Register SrcReg = MI.getOperand(1).getReg();
5452   LLT DstTy = MRI.getType(DstReg);
5453   LLT SrcTy = MRI.getType(SrcReg);
5454   unsigned NarrowSize = NarrowTy.getSizeInBits();
5455 
5456   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5457     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5458 
5459     MachineIRBuilder &B = MIRBuilder;
5460     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5461     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5462     auto C_0 = B.buildConstant(NarrowTy, 0);
5463     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5464                                 UnmergeSrc.getReg(1), C_0);
5465     auto LoCTLZ = IsUndef ?
5466       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5467       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5468     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5469     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5470     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5471     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5472 
5473     MI.eraseFromParent();
5474     return Legalized;
5475   }
5476 
5477   return UnableToLegalize;
5478 }
5479 
5480 LegalizerHelper::LegalizeResult
5481 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5482                                   LLT NarrowTy) {
5483   if (TypeIdx != 1)
5484     return UnableToLegalize;
5485 
5486   Register DstReg = MI.getOperand(0).getReg();
5487   Register SrcReg = MI.getOperand(1).getReg();
5488   LLT DstTy = MRI.getType(DstReg);
5489   LLT SrcTy = MRI.getType(SrcReg);
5490   unsigned NarrowSize = NarrowTy.getSizeInBits();
5491 
5492   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5493     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5494 
5495     MachineIRBuilder &B = MIRBuilder;
5496     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5497     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5498     auto C_0 = B.buildConstant(NarrowTy, 0);
5499     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5500                                 UnmergeSrc.getReg(0), C_0);
5501     auto HiCTTZ = IsUndef ?
5502       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5503       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5504     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5505     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5506     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5507     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5508 
5509     MI.eraseFromParent();
5510     return Legalized;
5511   }
5512 
5513   return UnableToLegalize;
5514 }
5515 
5516 LegalizerHelper::LegalizeResult
5517 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5518                                    LLT NarrowTy) {
5519   if (TypeIdx != 1)
5520     return UnableToLegalize;
5521 
5522   Register DstReg = MI.getOperand(0).getReg();
5523   LLT DstTy = MRI.getType(DstReg);
5524   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5525   unsigned NarrowSize = NarrowTy.getSizeInBits();
5526 
5527   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5528     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5529 
5530     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5531     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5532     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5533 
5534     MI.eraseFromParent();
5535     return Legalized;
5536   }
5537 
5538   return UnableToLegalize;
5539 }
5540 
/// Expand the bit-counting operations G_CTLZ[_ZERO_UNDEF],
/// G_CTTZ[_ZERO_UNDEF] and G_CTPOP into sequences of simpler generic
/// instructions. Returns UnableToLegalize for any other opcode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // An operation we expand into is usable if the target handles it in any
  // way other than lowering it again (legal, libcall or custom).
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      // ctlz(0) is defined as the source bit width.
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    // Smear the highest set bit into every lower position so that popcount
    // afterwards counts exactly the bits that are not leading zeros.
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      // cttz(0) is defined as the source bit width.
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    // ~x & (x - 1) sets exactly the bits below the lowest set bit of x.
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Reuse this instruction as a CTPOP of the masked value.
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}
5703 
5704 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5705 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5706                                         Register Reg, unsigned BW) {
5707   return matchUnaryPredicate(
5708       MRI, Reg,
5709       [=](const Constant *C) {
5710         // Null constant here means an undef.
5711         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5712         return !CI || CI->getValue().urem(BW) != 0;
5713       },
5714       /*AllowUndefs*/ true);
5715 }
5716 
5717 LegalizerHelper::LegalizeResult
5718 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5719   Register Dst = MI.getOperand(0).getReg();
5720   Register X = MI.getOperand(1).getReg();
5721   Register Y = MI.getOperand(2).getReg();
5722   Register Z = MI.getOperand(3).getReg();
5723   LLT Ty = MRI.getType(Dst);
5724   LLT ShTy = MRI.getType(Z);
5725 
5726   unsigned BW = Ty.getScalarSizeInBits();
5727 
5728   if (!isPowerOf2_32(BW))
5729     return UnableToLegalize;
5730 
5731   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5732   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5733 
5734   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5735     // fshl X, Y, Z -> fshr X, Y, -Z
5736     // fshr X, Y, Z -> fshl X, Y, -Z
5737     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5738     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5739   } else {
5740     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5741     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5742     auto One = MIRBuilder.buildConstant(ShTy, 1);
5743     if (IsFSHL) {
5744       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5745       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5746     } else {
5747       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5748       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5749     }
5750 
5751     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5752   }
5753 
5754   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5755   MI.eraseFromParent();
5756   return Legalized;
5757 }
5758 
5759 LegalizerHelper::LegalizeResult
5760 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5761   Register Dst = MI.getOperand(0).getReg();
5762   Register X = MI.getOperand(1).getReg();
5763   Register Y = MI.getOperand(2).getReg();
5764   Register Z = MI.getOperand(3).getReg();
5765   LLT Ty = MRI.getType(Dst);
5766   LLT ShTy = MRI.getType(Z);
5767 
5768   const unsigned BW = Ty.getScalarSizeInBits();
5769   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5770 
5771   Register ShX, ShY;
5772   Register ShAmt, InvShAmt;
5773 
5774   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5775   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5776     // fshl: X << C | Y >> (BW - C)
5777     // fshr: X << (BW - C) | Y >> C
5778     // where C = Z % BW is not zero
5779     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5780     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5781     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5782     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5783     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5784   } else {
5785     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5786     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5787     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5788     if (isPowerOf2_32(BW)) {
5789       // Z % BW -> Z & (BW - 1)
5790       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5791       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5792       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5793       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5794     } else {
5795       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5796       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5797       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5798     }
5799 
5800     auto One = MIRBuilder.buildConstant(ShTy, 1);
5801     if (IsFSHL) {
5802       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5803       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5804       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5805     } else {
5806       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5807       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5808       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5809     }
5810   }
5811 
5812   MIRBuilder.buildOr(Dst, ShX, ShY);
5813   MI.eraseFromParent();
5814   return Legalized;
5815 }
5816 
5817 LegalizerHelper::LegalizeResult
5818 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5819   // These operations approximately do the following (while avoiding undefined
5820   // shifts by BW):
5821   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5822   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5823   Register Dst = MI.getOperand(0).getReg();
5824   LLT Ty = MRI.getType(Dst);
5825   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5826 
5827   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5828   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5829 
5830   // TODO: Use smarter heuristic that accounts for vector legalization.
5831   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5832     return lowerFunnelShiftAsShifts(MI);
5833 
5834   // This only works for powers of 2, fallback to shifts if it fails.
5835   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5836   if (Result == UnableToLegalize)
5837     return lowerFunnelShiftAsShifts(MI);
5838   return Result;
5839 }
5840 
5841 LegalizerHelper::LegalizeResult
5842 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5843   Register Dst = MI.getOperand(0).getReg();
5844   Register Src = MI.getOperand(1).getReg();
5845   Register Amt = MI.getOperand(2).getReg();
5846   LLT AmtTy = MRI.getType(Amt);
5847   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5848   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5849   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5850   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5851   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5852   MI.eraseFromParent();
5853   return Legalized;
5854 }
5855 
5856 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5857   Register Dst = MI.getOperand(0).getReg();
5858   Register Src = MI.getOperand(1).getReg();
5859   Register Amt = MI.getOperand(2).getReg();
5860   LLT DstTy = MRI.getType(Dst);
5861   LLT SrcTy = MRI.getType(Dst);
5862   LLT AmtTy = MRI.getType(Amt);
5863 
5864   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5865   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5866 
5867   MIRBuilder.setInstrAndDebugLoc(MI);
5868 
5869   // If a rotate in the other direction is supported, use it.
5870   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5871   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5872       isPowerOf2_32(EltSizeInBits))
5873     return lowerRotateWithReverseRotate(MI);
5874 
5875   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5876   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
5877   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
5878   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
5879   Register ShVal;
5880   Register RevShiftVal;
5881   if (isPowerOf2_32(EltSizeInBits)) {
5882     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
5883     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
5884     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5885     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
5886     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5887     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
5888     RevShiftVal =
5889         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
5890   } else {
5891     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
5892     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
5893     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
5894     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
5895     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5896     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
5897     auto One = MIRBuilder.buildConstant(AmtTy, 1);
5898     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
5899     RevShiftVal =
5900         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
5901   }
5902   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
5903   MI.eraseFromParent();
5904   return Legalized;
5905 }
5906 
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // The expansion follows this reference implementation:
  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  // lz = clz(u). The Src == 0 case is handled by the select below, so the
  // ZERO_UNDEF variant is safe here.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  // Biased exponent: e = 127 + 63 - lz, or 0 when the input is zero.
  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  // Normalize: shift the leading 1 to the top and mask it off (it is
  // implicit in the IEEE encoding).
  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  // t = the low 40 bits that will be rounded away.
  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  // v = (e << 23) | top 23 mantissa bits.
  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  // Round to nearest even: r = 1 when t is above the halfway point
  // (0x8000000000), v & 1 on an exact tie, and 0 otherwise.
  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
5965 
5966 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5967   Register Dst = MI.getOperand(0).getReg();
5968   Register Src = MI.getOperand(1).getReg();
5969   LLT DstTy = MRI.getType(Dst);
5970   LLT SrcTy = MRI.getType(Src);
5971 
5972   if (SrcTy == LLT::scalar(1)) {
5973     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
5974     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
5975     MIRBuilder.buildSelect(Dst, Src, True, False);
5976     MI.eraseFromParent();
5977     return Legalized;
5978   }
5979 
5980   if (SrcTy != LLT::scalar(64))
5981     return UnableToLegalize;
5982 
5983   if (DstTy == LLT::scalar(32)) {
5984     // TODO: SelectionDAG has several alternative expansions to port which may
5985     // be more reasonble depending on the available instructions. If a target
5986     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
5987     // intermediate type, this is probably worse.
5988     return lowerU64ToF32BitOps(MI);
5989   }
5990 
5991   return UnableToLegalize;
5992 }
5993 
5994 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
5995   Register Dst = MI.getOperand(0).getReg();
5996   Register Src = MI.getOperand(1).getReg();
5997   LLT DstTy = MRI.getType(Dst);
5998   LLT SrcTy = MRI.getType(Src);
5999 
6000   const LLT S64 = LLT::scalar(64);
6001   const LLT S32 = LLT::scalar(32);
6002   const LLT S1 = LLT::scalar(1);
6003 
6004   if (SrcTy == S1) {
6005     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6006     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6007     MIRBuilder.buildSelect(Dst, Src, True, False);
6008     MI.eraseFromParent();
6009     return Legalized;
6010   }
6011 
6012   if (SrcTy != S64)
6013     return UnableToLegalize;
6014 
6015   if (DstTy == S32) {
6016     // signed cl2f(long l) {
6017     //   long s = l >> 63;
6018     //   float r = cul2f((l + s) ^ s);
6019     //   return s ? -r : r;
6020     // }
6021     Register L = Src;
6022     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6023     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6024 
6025     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6026     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6027     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6028 
6029     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6030     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6031                                             MIRBuilder.buildConstant(S64, 0));
6032     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6033     MI.eraseFromParent();
6034     return Legalized;
6035   }
6036 
6037   return UnableToLegalize;
6038 }
6039 
6040 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6041   Register Dst = MI.getOperand(0).getReg();
6042   Register Src = MI.getOperand(1).getReg();
6043   LLT DstTy = MRI.getType(Dst);
6044   LLT SrcTy = MRI.getType(Src);
6045   const LLT S64 = LLT::scalar(64);
6046   const LLT S32 = LLT::scalar(32);
6047 
6048   if (SrcTy != S64 && SrcTy != S32)
6049     return UnableToLegalize;
6050   if (DstTy != S32 && DstTy != S64)
6051     return UnableToLegalize;
6052 
6053   // FPTOSI gives same result as FPTOUI for positive signed integers.
6054   // FPTOUI needs to deal with fp values that convert to unsigned integers
6055   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6056 
6057   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6058   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6059                                                 : APFloat::IEEEdouble(),
6060                     APInt::getNullValue(SrcTy.getSizeInBits()));
6061   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6062 
6063   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6064 
6065   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6066   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6067   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6068   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6069   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6070   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6071   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6072 
6073   const LLT S1 = LLT::scalar(1);
6074 
6075   MachineInstrBuilder FCMP =
6076       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6077   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6078 
6079   MI.eraseFromParent();
6080   return Legalized;
6081 }
6082 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent field (bits [30:23] of the f32).
  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  // Smear the sign bit across the whole source width (arithmetic shift),
  // then sign-extend into the destination: Sign is 0 or -1.
  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  // Mantissa with the implicit leading one (0x00800000) made explicit.
  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  // Unbias the exponent and compute both candidate shift amounts; the
  // select below picks the one that applies.
  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  // Shift the significand left when the exponent exceeds the mantissa
  // width, right otherwise.
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is -1 and is a
  // no-op when Sign is 0.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  // A negative unbiased exponent means |value| < 1, which truncates to 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
6149 
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  // Split the f64 bit pattern into its low (U) and high (UH) 32-bit halves.
  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  // E = the f64's biased exponent, taken from bits [30:20] of the high half
  // (bits [62:52] of the double).
  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
    S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  // M = the top mantissa bits, positioned for the f16 significand plus one
  // extra rounding bit.
  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  // Fold all the discarded low mantissa bits into a single sticky bit.
  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // I = the Inf/NaN result pattern:
  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // Normal-number candidate:
  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // How far to shift the significand right for a denormal result:
  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  // D = the denormal significand, with a sticky bit recording whether any
  // bits were shifted out (detected by shifting back and comparing).
  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                             D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  // Select the denormal candidate when E < 1, the normal one otherwise.
  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  // Round to nearest even using the low 3 bits as guard/round/sticky.
  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  // Exponent overflow (E > 30) produces infinity.
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  // E == 1039 (== 2047 - 1023 + 15) means the f64 exponent field was
  // all-ones, i.e. Inf/NaN; use the I pattern built above.
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
6255 
6256 LegalizerHelper::LegalizeResult
6257 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6258   Register Dst = MI.getOperand(0).getReg();
6259   Register Src = MI.getOperand(1).getReg();
6260 
6261   LLT DstTy = MRI.getType(Dst);
6262   LLT SrcTy = MRI.getType(Src);
6263   const LLT S64 = LLT::scalar(64);
6264   const LLT S16 = LLT::scalar(16);
6265 
6266   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6267     return lowerFPTRUNC_F64_TO_F16(MI);
6268 
6269   return UnableToLegalize;
6270 }
6271 
6272 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6273 // multiplication tree.
6274 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6275   Register Dst = MI.getOperand(0).getReg();
6276   Register Src0 = MI.getOperand(1).getReg();
6277   Register Src1 = MI.getOperand(2).getReg();
6278   LLT Ty = MRI.getType(Dst);
6279 
6280   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6281   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6282   MI.eraseFromParent();
6283   return Legalized;
6284 }
6285 
6286 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6287   switch (Opc) {
6288   case TargetOpcode::G_SMIN:
6289     return CmpInst::ICMP_SLT;
6290   case TargetOpcode::G_SMAX:
6291     return CmpInst::ICMP_SGT;
6292   case TargetOpcode::G_UMIN:
6293     return CmpInst::ICMP_ULT;
6294   case TargetOpcode::G_UMAX:
6295     return CmpInst::ICMP_UGT;
6296   default:
6297     llvm_unreachable("not in integer min/max");
6298   }
6299 }
6300 
6301 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6302   Register Dst = MI.getOperand(0).getReg();
6303   Register Src0 = MI.getOperand(1).getReg();
6304   Register Src1 = MI.getOperand(2).getReg();
6305 
6306   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6307   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6308 
6309   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6310   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6311 
6312   MI.eraseFromParent();
6313   return Legalized;
6314 }
6315 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  // Lower G_FCOPYSIGN with bit operations:
  //   result = (Src0 with its top bit cleared) | (Src1's top bit, moved to
  //   Src0's top-bit position when the widths differ).
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  const LLT Src0Ty = MRI.getType(Src0);
  const LLT Src1Ty = MRI.getType(Src1);

  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  // Mask with only the sign (top) bit set.
  auto SignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getSignMask(Src0Size));

  // Mask with every bit except the sign bit set.
  auto NotSignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  // Clear the sign bit of the magnitude operand.
  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    // Same width: isolate Src1's sign bit in place.
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    // Narrower sign source: widen, then shift its sign bit up to Src0's MSB.
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    // Wider sign source: shift its sign bit down to Src0's MSB, then narrow.
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();
  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
6359 
6360 LegalizerHelper::LegalizeResult
6361 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6362   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6363     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6364 
6365   Register Dst = MI.getOperand(0).getReg();
6366   Register Src0 = MI.getOperand(1).getReg();
6367   Register Src1 = MI.getOperand(2).getReg();
6368   LLT Ty = MRI.getType(Dst);
6369 
6370   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6371     // Insert canonicalizes if it's possible we need to quiet to get correct
6372     // sNaN behavior.
6373 
6374     // Note this must be done here, and not as an optimization combine in the
6375     // absence of a dedicate quiet-snan instruction as we're using an
6376     // omni-purpose G_FCANONICALIZE.
6377     if (!isKnownNeverSNaN(Src0, MRI))
6378       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6379 
6380     if (!isKnownNeverSNaN(Src1, MRI))
6381       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6382   }
6383 
6384   // If there are no nans, it's safe to simply replace this with the non-IEEE
6385   // version.
6386   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6387   MI.eraseFromParent();
6388   return Legalized;
6389 }
6390 
6391 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6392   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6393   Register DstReg = MI.getOperand(0).getReg();
6394   LLT Ty = MRI.getType(DstReg);
6395   unsigned Flags = MI.getFlags();
6396 
6397   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6398                                   Flags);
6399   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6400   MI.eraseFromParent();
6401   return Legalized;
6402 }
6403 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  // Lower round-to-nearest (halfway away from zero) via trunc + copysign.
  Register DstReg = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1); // One bit per element for fcmp.

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(1.0f, x);
  //  return t + (d >= 0.5 ? o : 0.0);

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  // |x - t| is the magnitude of the fractional part.
  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  // +/-1.0 carrying x's sign, so the adjustment rounds away from zero.
  auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);

  auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
                                  Flags);
  auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);

  MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);

  MI.eraseFromParent();
  return Legalized;
}
6436 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFFloor(MachineInstr &MI) {
  // Lower floor(x) via trunc: truncation rounds toward zero, so negative
  // non-integral inputs need an extra -1.0 adjustment.
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1); // One bit per element for fcmp.

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);

  auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
                                  SrcReg, Zero, Flags);
  auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
                                      SrcReg, Trunc, Flags);
  // When both conditions hold, And is the 1-bit value 1; signed conversion
  // of that is -1.0, which is exactly the adjustment. Otherwise it's 0.0.
  auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
  auto AddVal = MIRBuilder.buildSITOFP(Ty, And);

  MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
  MI.eraseFromParent();
  return Legalized;
}
6463 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  // Lower G_MERGE_VALUES by zero-extending each source into one wide scalar,
  // shifting it to its bit offset, and OR-ing the pieces together.
  const unsigned NumOps = MI.getNumOperands();
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);
  unsigned PartSize = SrcTy.getSizeInBits();

  // Accumulate into a plain scalar; a pointer result is cast at the end.
  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    // Bit offset of source operand I within the result.
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    // The final OR can define DstReg directly when no pointer cast follows.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Int -> pointer casts are not valid for non-integral address spaces.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
6504 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  // Lower G_UNMERGE_VALUES by viewing the source as one wide integer and
  // truncating successively right-shifted copies into each destination.
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  // Destination 0 is the low bits: no shift required.
  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
6534 
/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
///  =>
///  %stack_temp = G_FRAME_INDEX
///  G_STORE %vec, %stack_temp
///  %idx = clamp(%idx, %vec.getNumElements())
///  %element_ptr = G_PTR_ADD %stack_temp, %idx
///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  // Only G_INSERT_VECTOR_ELT carries a value operand; a null InsertVal
  // distinguishes the extract case below.
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  // The index is the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  // Spill the whole vector to a stack slot.
  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
                                        VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    // Constant index: the exact byte offset (and alignment) is known.
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    // NOTE(review): PtrInfo here is the element's pointer info (offset or
    // address-space-only form), not the base of the stack temporary — confirm
    // this is the intended memory-operand info for the full-vector reload.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}
6599 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  // Lower G_SHUFFLE_VECTOR by extracting each selected source element and
  // rebuilding the result with G_BUILD_VECTOR. A scalar destination
  // degenerates to picking one of the two scalar sources.
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  LLT Src0Ty = MRI.getType(Src0Reg);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(32); // Index type for the element extracts below.

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

  if (DstTy.isScalar()) {
    if (Src0Ty.isVector())
      return UnableToLegalize;

    // This is just a SELECT.
    assert(Mask.size() == 1 && "Expected a single mask element");
    Register Val;
    // Out-of-range mask values (including negative) select undef.
    if (Mask[0] < 0 || Mask[0] > 1)
      Val = MIRBuilder.buildUndef(DstTy).getReg(0);
    else
      Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
    MIRBuilder.buildCopy(DstReg, Val);
    MI.eraseFromParent();
    return Legalized;
  }

  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getElementType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      // Negative mask entries mean "don't care"; reuse one undef value.
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      // Scalar sources: index 0 is Src0, anything else is Src1.
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      // Indices >= NumElts address the second source vector.
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
6655 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  // Lower G_DYNAMIC_STACKALLOC by adjusting the stack pointer directly:
  // SP -= size, optionally aligned down, then the new SP is the result.
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  // The SP-decrement sequence below assumes a downward-growing stack.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    // Align the new SP downward by masking with -alignment (ones above the
    // alignment's low bits, zeros below).
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  // Commit the new SP and return it as the allocated pointer.
  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}
6692 
6693 LegalizerHelper::LegalizeResult
6694 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6695   Register Dst = MI.getOperand(0).getReg();
6696   Register Src = MI.getOperand(1).getReg();
6697   unsigned Offset = MI.getOperand(2).getImm();
6698 
6699   LLT DstTy = MRI.getType(Dst);
6700   LLT SrcTy = MRI.getType(Src);
6701 
6702   if (DstTy.isScalar() &&
6703       (SrcTy.isScalar() ||
6704        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6705     LLT SrcIntTy = SrcTy;
6706     if (!SrcTy.isScalar()) {
6707       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6708       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6709     }
6710 
6711     if (Offset == 0)
6712       MIRBuilder.buildTrunc(Dst, Src);
6713     else {
6714       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6715       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6716       MIRBuilder.buildTrunc(Dst, Shr);
6717     }
6718 
6719     MI.eraseFromParent();
6720     return Legalized;
6721   }
6722 
6723   return UnableToLegalize;
6724 }
6725 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  // Lower G_INSERT with bit operations: clear the destination's target bits
  // with a mask, then OR in the zero-extended, shifted inserted value.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register InsertSrc = MI.getOperand(2).getReg();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Vector payloads (other than inserting a full element) aren't handled.
  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Pointers in non-integral address spaces can't round-trip through ints.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  // Do the bit manipulation in a scalar integer of the destination's size.
  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  // Position the inserted bits at their offset within the destination.
  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  // Mask is zero over [Offset, Offset + insert size) and one elsewhere, so
  // the AND clears exactly the bits being replaced.
  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
6777 
6778 LegalizerHelper::LegalizeResult
6779 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6780   Register Dst0 = MI.getOperand(0).getReg();
6781   Register Dst1 = MI.getOperand(1).getReg();
6782   Register LHS = MI.getOperand(2).getReg();
6783   Register RHS = MI.getOperand(3).getReg();
6784   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6785 
6786   LLT Ty = MRI.getType(Dst0);
6787   LLT BoolTy = MRI.getType(Dst1);
6788 
6789   if (IsAdd)
6790     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6791   else
6792     MIRBuilder.buildSub(Dst0, LHS, RHS);
6793 
6794   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6795 
6796   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6797 
6798   // For an addition, the result should be less than one of the operands (LHS)
6799   // if and only if the other operand (RHS) is negative, otherwise there will
6800   // be overflow.
6801   // For a subtraction, the result should be less than one of the operands
6802   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6803   // otherwise there will be overflow.
6804   auto ResultLowerThanLHS =
6805       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6806   auto ConditionRHS = MIRBuilder.buildICmp(
6807       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6808 
6809   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6810   MI.eraseFromParent();
6811   return Legalized;
6812 }
6813 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  // Lower G_[SU]{ADD,SUB}SAT with min/max: clamp RHS into the range where the
  // plain add/sub cannot overflow, then emit the plain operation.
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    // Clamp RHS to [lo, hi] and apply the base operation.
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
6890 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  // Lower G_[SU]{ADD,SUB}SAT with the corresponding overflow op: compute the
  // raw result and an overflow bit, then select the saturated value when
  // overflow occurred.
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // The arithmetic shift broadcasts tmp's sign bit; adding the minimum
    // value turns that into INT_MAX or INT_MIN as appropriate.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
6958 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  // Lower G_[SU]SHLSAT: shift, then shift back; if the round trip does not
  // reproduce the original LHS, bits were lost and the result saturates.
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  // Undo the shift with the matching right-shift for the signedness.
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    // Signed saturation: INT_MIN when LHS is negative, INT_MAX otherwise.
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    // Unsigned saturation: all-ones.
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}
6992 
6993 LegalizerHelper::LegalizeResult
6994 LegalizerHelper::lowerBswap(MachineInstr &MI) {
6995   Register Dst = MI.getOperand(0).getReg();
6996   Register Src = MI.getOperand(1).getReg();
6997   const LLT Ty = MRI.getType(Src);
6998   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
6999   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7000 
7001   // Swap most and least significant byte, set remaining bytes in Res to zero.
7002   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7003   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7004   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7005   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7006 
7007   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7008   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7009     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7010     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7011     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7012     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7013     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7014     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7015     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7016     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7017     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7018     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7019     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7020     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7021   }
7022   Res.getInstr()->getOperand(0).setReg(Dst);
7023 
7024   MI.eraseFromParent();
7025   return Legalized;
7026 }
7027 
7028 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7029 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7030                                  MachineInstrBuilder Src, APInt Mask) {
7031   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7032   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7033   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7034   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7035   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7036   return B.buildOr(Dst, LHS, RHS);
7037 }
7038 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  // Lower G_BITREVERSE as a byte swap followed by bit swaps inside each
  // byte: swap nibbles, then bit pairs, then adjacent bits.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getSizeInBits();

  MachineInstrBuilder BSWAP =
      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
  MachineInstrBuilder Swap4 =
      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
  MachineInstrBuilder Swap2 =
      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));

  MI.eraseFromParent();
  return Legalized;
}
7069 
7070 LegalizerHelper::LegalizeResult
7071 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7072   MachineFunction &MF = MIRBuilder.getMF();
7073 
7074   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7075   int NameOpIdx = IsRead ? 1 : 0;
7076   int ValRegIndex = IsRead ? 0 : 1;
7077 
7078   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7079   const LLT Ty = MRI.getType(ValReg);
7080   const MDString *RegStr = cast<MDString>(
7081     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7082 
7083   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7084   if (!PhysReg.isValid())
7085     return UnableToLegalize;
7086 
7087   if (IsRead)
7088     MIRBuilder.buildCopy(ValReg, PhysReg);
7089   else
7090     MIRBuilder.buildCopy(PhysReg, ValReg);
7091 
7092   MI.eraseFromParent();
7093   return Legalized;
7094 }
7095 
7096 LegalizerHelper::LegalizeResult
7097 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7098   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7099   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7100   Register Result = MI.getOperand(0).getReg();
7101   LLT OrigTy = MRI.getType(Result);
7102   auto SizeInBits = OrigTy.getScalarSizeInBits();
7103   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7104 
7105   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7106   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7107   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7108   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7109 
7110   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7111   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7112   MIRBuilder.buildTrunc(Result, Shifted);
7113 
7114   MI.eraseFromParent();
7115   return Legalized;
7116 }
7117 
7118 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7119   // Implement vector G_SELECT in terms of XOR, AND, OR.
7120   Register DstReg = MI.getOperand(0).getReg();
7121   Register MaskReg = MI.getOperand(1).getReg();
7122   Register Op1Reg = MI.getOperand(2).getReg();
7123   Register Op2Reg = MI.getOperand(3).getReg();
7124   LLT DstTy = MRI.getType(DstReg);
7125   LLT MaskTy = MRI.getType(MaskReg);
7126   LLT Op1Ty = MRI.getType(Op1Reg);
7127   if (!DstTy.isVector())
7128     return UnableToLegalize;
7129 
7130   // Vector selects can have a scalar predicate. If so, splat into a vector and
7131   // finish for later legalization attempts to try again.
7132   if (MaskTy.isScalar()) {
7133     Register MaskElt = MaskReg;
7134     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7135       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7136     // Generate a vector splat idiom to be pattern matched later.
7137     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7138     Observer.changingInstr(MI);
7139     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7140     Observer.changedInstr(MI);
7141     return Legalized;
7142   }
7143 
7144   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7145     return UnableToLegalize;
7146   }
7147 
7148   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7149   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7150   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7151   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7152   MI.eraseFromParent();
7153   return Legalized;
7154 }
7155 
7156 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7157   // Split DIVREM into individual instructions.
7158   unsigned Opcode = MI.getOpcode();
7159 
7160   MIRBuilder.buildInstr(
7161       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7162                                         : TargetOpcode::G_UDIV,
7163       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7164   MIRBuilder.buildInstr(
7165       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7166                                         : TargetOpcode::G_UREM,
7167       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7168   MI.eraseFromParent();
7169   return Legalized;
7170 }
7171 
7172 LegalizerHelper::LegalizeResult
7173 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7174   // Expand %res = G_ABS %a into:
7175   // %v1 = G_ASHR %a, scalar_size-1
7176   // %v2 = G_ADD %a, %v1
7177   // %res = G_XOR %v2, %v1
7178   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7179   Register OpReg = MI.getOperand(1).getReg();
7180   auto ShiftAmt =
7181       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7182   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7183   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7184   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7185   MI.eraseFromParent();
7186   return Legalized;
7187 }
7188 
7189 LegalizerHelper::LegalizeResult
7190 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7191   // Expand %res = G_ABS %a into:
7192   // %v1 = G_CONSTANT 0
7193   // %v2 = G_SUB %v1, %a
7194   // %res = G_SMAX %a, %v2
7195   Register SrcReg = MI.getOperand(1).getReg();
7196   LLT Ty = MRI.getType(SrcReg);
7197   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7198   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7199   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7200   MI.eraseFromParent();
7201   return Legalized;
7202 }
7203