1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/LowLevelType.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/TargetFrameLowering.h"
25 #include "llvm/CodeGen/TargetInstrInfo.h"
26 #include "llvm/CodeGen/TargetLowering.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/TargetSubtargetInfo.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/MathExtras.h"
32 #include "llvm/Support/raw_ostream.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
42 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
43 /// with any leftover piece as type \p LeftoverTy
44 ///
45 /// Returns -1 in the first element of the pair if the breakdown is not
46 /// satisfiable.
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
74 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
75 
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
// Construct a helper for \p MF, taking the LegalizerInfo registered with the
// function's subtarget. Replacement code is emitted through \p Builder and
// every change is reported to \p Observer.
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
// Overload that lets the caller supply an explicit LegalizerInfo instead of
// using the one attached to the subtarget.
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B)
  : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
    TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
// Perform one legalization step on \p MI: look up the target's rule for the
// instruction and dispatch to the matching transform. Returns AlreadyLegal,
// Legalized, or UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  // Emit any replacement code at MI's position with MI's debug location.
  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics bypass the action table and go straight to the target hook.
  if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
      MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
  default:
    // Covers Unsupported and any action with no transform implemented here.
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
// Split \p Reg (of type \p RegTy) into as many \p MainTy pieces as fit, plus
// leftover pieces of a smaller type when MainTy does not evenly divide RegTy.
// On success the main pieces are appended to \p VRegs, any remainder pieces
// to \p LeftoverRegs, and \p LeftoverTy (an out argument) is set to the
// remainder type. Returns false if no valid leftover type exists.
bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
                                   LLT MainTy, LLT &LeftoverTy,
                                   SmallVectorImpl<Register> &VRegs,
                                   SmallVectorImpl<Register> &LeftoverRegs) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned RegSize = RegTy.getSizeInBits();
  unsigned MainSize = MainTy.getSizeInBits();
  unsigned NumParts = RegSize / MainSize;
  unsigned LeftoverSize = RegSize - NumParts * MainSize;

  // Use an unmerge when possible.
  if (LeftoverSize == 0) {
    for (unsigned I = 0; I < NumParts; ++I)
      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
    MIRBuilder.buildUnmerge(VRegs, Reg);
    return true;
  }

  if (MainTy.isVector()) {
    // The leftover must be a whole number of elements of the main type.
    unsigned EltSize = MainTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return false;
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  // For irregular sizes, extract the individual parts.
  for (unsigned I = 0; I != NumParts; ++I) {
    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
    VRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
  }

  // Extract the remainder pieces from the bits past the main parts.
  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
       Offset += LeftoverSize) {
    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
    LeftoverRegs.push_back(NewReg);
    MIRBuilder.buildExtract(NewReg, Reg, Offset);
  }

  return true;
}
205 
// Reassemble \p DstReg (of type \p ResultTy) from \p PartTy pieces in
// \p PartRegs plus optional \p LeftoverTy pieces in \p LeftoverRegs — the
// inverse of extractParts above.
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  // With no leftover, the parts tile the result exactly and a single merge /
  // vector build suffices.
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMerge(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Mixed part sizes: break everything down to the common GCD type, remerge
  // to the LCM type, then extract the destination bits from that.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
233 
234 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
235 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
236                               const MachineInstr &MI) {
237   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
238 
239   const int StartIdx = Regs.size();
240   const int NumResults = MI.getNumOperands() - 1;
241   Regs.resize(Regs.size() + NumResults);
242   for (int I = 0; I != NumResults; ++I)
243     Regs[StartIdx + I] = MI.getOperand(I).getReg();
244 }
245 
246 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
247                                      LLT GCDTy, Register SrcReg) {
248   LLT SrcTy = MRI.getType(SrcReg);
249   if (SrcTy == GCDTy) {
250     // If the source already evenly divides the result type, we don't need to do
251     // anything.
252     Parts.push_back(SrcReg);
253   } else {
254     // Need to split into common type sized pieces.
255     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
256     getUnmergeResults(Parts, *Unmerge);
257   }
258 }
259 
260 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
261                                     LLT NarrowTy, Register SrcReg) {
262   LLT SrcTy = MRI.getType(SrcReg);
263   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
264   extractGCDType(Parts, GCDTy, SrcReg);
265   return GCDTy;
266 }
267 
// Merge the \p GCDTy pieces in \p VRegs into \p NarrowTy-typed pieces that
// cover the LCM of \p DstTy and \p NarrowTy. If the sources don't fill the
// LCM type, pad according to \p PadStrategy: G_ZEXT pads with zero, G_ANYEXT
// with undef, G_SEXT with copies of the last piece's sign bits. On return
// \p VRegs holds the NarrowTy pieces; the LCM type is returned.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        // Past the original sources: fill with the padding value.
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  // Hand the NarrowTy pieces back to the caller through the in/out vector.
  VRegs = std::move(Remerge);
  return LCMTy;
}
358 
359 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
360                                                ArrayRef<Register> RemergeRegs) {
361   LLT DstTy = MRI.getType(DstReg);
362 
363   // Create the merge to the widened source, and extract the relevant bits into
364   // the result.
365 
366   if (DstTy == LCMTy) {
367     MIRBuilder.buildMerge(DstReg, RemergeRegs);
368     return;
369   }
370 
371   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
372   if (DstTy.isScalar() && LCMTy.isScalar()) {
373     MIRBuilder.buildTrunc(DstReg, Remerge);
374     return;
375   }
376 
377   if (LCMTy.isVector()) {
378     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
379     SmallVector<Register, 8> UnmergeDefs(NumDefs);
380     UnmergeDefs[0] = DstReg;
381     for (unsigned I = 1; I != NumDefs; ++I)
382       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
383 
384     MIRBuilder.buildUnmerge(UnmergeDefs,
385                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
386     return;
387   }
388 
389   llvm_unreachable("unhandled case");
390 }
391 
/// Map a generic opcode plus scalar bit width to the matching RTLIB libcall
/// enum. Integer opcodes support 32/64/128-bit widths; floating-point opcodes
/// additionally support 80-bit. Any other width hits llvm_unreachable.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
// Expands to a size-dispatching switch for integer libcalls (no 80-bit case).
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

// Same as RTLIBCASE_INT but also handles the x87 80-bit floating-point size.
#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}
481 
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
///
/// Requires the caller's return attributes to be compatible with the call,
/// and the instruction to be followed (ignoring debug instructions) either
/// directly by a return, or — for the memory opcodes — by a COPY of the
/// result into a physical register that the return uses.
static bool isLibCallInTailPosition(MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("unsupported opcode");
    case TargetOpcode::G_BZERO:
      // bzero has no result register, so a COPY here can't be `thisreturn`.
      return false;
    case TargetOpcode::G_MEMCPY:
    case TargetOpcode::G_MEMMOVE:
    case TargetOpcode::G_MEMSET:
      break;
    }

    // The COPY's source must be this instruction's virtual result register.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    // ... and it must land in a physical register for the return to use.
    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    // The return may only use that single copied physical register.
    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}
551 
552 LegalizerHelper::LegalizeResult
553 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
554                     const CallLowering::ArgInfo &Result,
555                     ArrayRef<CallLowering::ArgInfo> Args,
556                     const CallingConv::ID CC) {
557   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
558 
559   CallLowering::CallLoweringInfo Info;
560   Info.CallConv = CC;
561   Info.Callee = MachineOperand::CreateES(Name);
562   Info.OrigRet = Result;
563   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
564   if (!CLI.lowerCall(MIRBuilder, Info))
565     return LegalizerHelper::UnableToLegalize;
566 
567   return LegalizerHelper::Legalized;
568 }
569 
570 LegalizerHelper::LegalizeResult
571 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
572                     const CallLowering::ArgInfo &Result,
573                     ArrayRef<CallLowering::ArgInfo> Args) {
574   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
575   const char *Name = TLI.getLibcallName(Libcall);
576   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
577   return createLibcall(MIRBuilder, Name, Result, Args, CC);
578 }
579 
580 // Useful for libcalls where all operands have the same type.
581 static LegalizerHelper::LegalizeResult
582 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
583               Type *OpType) {
584   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
585 
586   // FIXME: What does the original arg index mean here?
587   SmallVector<CallLowering::ArgInfo, 3> Args;
588   for (unsigned i = 1; i < MI.getNumOperands(); i++)
589     Args.push_back({MI.getOperand(i).getReg(), OpType, 0});
590   return createLibcall(MIRBuilder, Libcall,
591                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
592 }
593 
// Lower a G_BZERO/G_MEMCPY/G_MEMMOVE/G_MEMSET \p MI to a call to the
// corresponding runtime routine, emitting it as a tail call when the
// instruction's trailing 'tail' immediate is set and the position allows it.
// On a lowered tail call, the now-dead trailing COPY/return instructions are
// erased here; erasing \p MI itself is left to the caller.
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    // These routines return their destination pointer argument.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  // The generic instruction's result is modeled via the 'returned' argument
  // flag above, so the call itself returns void.
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
                    isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
681 
682 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
683                                        Type *FromType) {
684   auto ToMVT = MVT::getVT(ToType);
685   auto FromMVT = MVT::getVT(FromType);
686 
687   switch (Opcode) {
688   case TargetOpcode::G_FPEXT:
689     return RTLIB::getFPEXT(FromMVT, ToMVT);
690   case TargetOpcode::G_FPTRUNC:
691     return RTLIB::getFPROUND(FromMVT, ToMVT);
692   case TargetOpcode::G_FPTOSI:
693     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
694   case TargetOpcode::G_FPTOUI:
695     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
696   case TargetOpcode::G_SITOFP:
697     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
698   case TargetOpcode::G_UITOFP:
699     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
700   }
701   llvm_unreachable("Unsupported libcall function");
702 }
703 
704 static LegalizerHelper::LegalizeResult
705 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
706                   Type *FromType) {
707   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
708   return createLibcall(MIRBuilder, Libcall,
709                        {MI.getOperand(0).getReg(), ToType, 0},
710                        {{MI.getOperand(1).getReg(), FromType, 0}});
711 }
712 
// Replace \p MI with a call to the matching runtime routine. Each case group
// below maps generic opcodes to the appropriate IR types and libcall builder;
// on success MI is erased and Legalized is returned.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
  unsigned Size = LLTy.getSizeInBits();
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // Integer ops: every operand uses one integer type of the result's width.
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    // FP ops: only the sizes with runtime routines (f32/f64/f80/f128) work.
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    // FP-to-FP conversions need distinct source and destination float types.
    Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // createMemLibcall may already have erased the following return when it
    // lowered a tail call, so erase MI and return here instead of falling
    // through to the shared erase below.
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}
822 
// Legalize \p MI by splitting the scalar type selected by \p TypeIdx into
// pieces of type \p NarrowTy. Most cases either dispatch to a dedicated
// narrowScalar* helper or open-code an unmerge/operate/merge sequence; cases
// that can't be handled (yet) report UnableToLegalize.
LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    //  a lot of regressions in the emitted code (superfluous COPYs, artifact
    //  combines not being hit). This seems to be a problem related to the
    //  artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    // Build one undef per narrow piece and recombine them into the original
    // full-width def.
    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    // Materialize each NarrowSize-wide slice of the constant, low bits first.
    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    // Any remaining high bits that don't fill a full NarrowTy piece become a
    // single constant of exactly the leftover width.
    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
        LeftoverTy,
        Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    return narrowScalarExt(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_TRUNC: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    // Only the exact-halving case is handled: the truncated value is the low
    // half of a two-piece unmerge of the source.
    uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
      LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
      return UnableToLegalize;
    }

    auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
    MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
    MI.eraseFromParent();
    return Legalized;
  }

  case TargetOpcode::G_FREEZE:
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_UMULH:
    return narrowScalarMul(MI, NarrowTy);
  case TargetOpcode::G_EXTRACT:
    return narrowScalarExtract(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_INSERT:
    return narrowScalarInsert(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_LOAD: {
    auto &LoadMI = cast<GLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isVector())
      return UnableToLegalize;

    // If the memory access is smaller than the register (an implicit
    // any-extending load), load into a narrow temporary and any-extend it.
    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
      MIRBuilder.buildAnyExt(DstReg, TmpReg);
      LoadMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    auto &LoadMI = cast<GExtLoad>(MI);
    Register DstReg = LoadMI.getDstReg();
    Register PtrReg = LoadMI.getPointerReg();

    // Load into a NarrowTy temporary, then re-apply the zext/sext to the
    // original destination width.
    Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
    auto &MMO = LoadMI.getMMO();
    unsigned MemSize = MMO.getSizeInBits();

    if (MemSize == NarrowSize) {
      // Memory exactly fills NarrowTy; a plain load suffices.
      MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
    } else if (MemSize < NarrowSize) {
      // Still an extending load, just to the narrower register type.
      MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
    } else if (MemSize > NarrowSize) {
      // FIXME: Need to split the load.
      return UnableToLegalize;
    }

    if (isa<GZExtLoad>(LoadMI))
      MIRBuilder.buildZExt(DstReg, TmpReg);
    else
      MIRBuilder.buildSExt(DstReg, TmpReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    auto &StoreMI = cast<GStore>(MI);

    Register SrcReg = StoreMI.getValueReg();
    LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return UnableToLegalize;

    int NumParts = SizeOp0 / NarrowSize;
    unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
    unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
    // NOTE(review): SrcTy.isVector() was already rejected above, so this
    // guard looks unreachable as written — presumably kept for safety.
    if (SrcTy.isVector() && LeftoverBits != 0)
      return UnableToLegalize;

    // An implicitly-truncating store: truncate to NarrowTy first, then store.
    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
      Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildTrunc(TmpReg, SrcReg);
      MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
      StoreMI.eraseFromParent();
      return Legalized;
    }

    return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
  }
  case TargetOpcode::G_SELECT:
    return narrowScalarSelect(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Legalize bitwise operation:
    // A = BinOp<Ty> B, C
    // into:
    // B1, ..., BN = G_UNMERGE_VALUES B
    // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C2
    // ...
    // AN = BinOp<Ty/N> BN, CN
    // A = G_MERGE_VALUES A1, ..., AN
    return narrowScalarBasic(MI, TypeIdx, NarrowTy);
  }
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    return narrowScalarShift(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTPOP:
    // TypeIdx 1 is the operand being counted; each bit-count op has its own
    // narrowing helper.
    if (TypeIdx == 1)
      switch (MI.getOpcode()) {
      case TargetOpcode::G_CTLZ:
      case TargetOpcode::G_CTLZ_ZERO_UNDEF:
        return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTTZ:
      case TargetOpcode::G_CTTZ_ZERO_UNDEF:
        return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
      case TargetOpcode::G_CTPOP:
        return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
      default:
        return UnableToLegalize;
      }

    // TypeIdx 0: the result count always fits in the narrow type, so just
    // narrow the def and zero-extend back.
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INTTOPTR:
    if (TypeIdx != 1)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PTRTOINT:
    if (TypeIdx != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_PHI: {
    // FIXME: add support for when SizeOp0 isn't an exact multiple of
    // NarrowSize.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    unsigned NumParts = SizeOp0 / NarrowSize;
    SmallVector<Register, 2> DstRegs(NumParts);
    SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
    Observer.changingInstr(MI);
    // Split each incoming value in its predecessor block, before that block's
    // terminator.
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
      MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
      extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
                   SrcRegs[i / 2]);
    }
    MachineBasicBlock &MBB = *MI.getParent();
    MIRBuilder.setInsertPt(MBB, MI);
    // Build one narrow G_PHI per part, preserving the (value, block) pairs.
    for (unsigned i = 0; i < NumParts; ++i) {
      DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
      MachineInstrBuilder MIB =
          MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
      for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
        MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
    }
    // The merge must come after all PHIs in the block.
    MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT: {
    if (TypeIdx != 2)
      return UnableToLegalize;

    // TypeIdx 2 is the index operand; it sits at operand 2 for extract and
    // operand 3 for insert.
    int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, OpIdx);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    Register LHS = MI.getOperand(2).getReg();
    LLT SrcTy = MRI.getType(LHS);
    uint64_t SrcSize = SrcTy.getSizeInBits();
    CmpInst::Predicate Pred =
        static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

    // TODO: Handle the non-equality case for weird sizes.
    if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
      return UnableToLegalize;

    LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
    SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
    if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
                      LHSLeftoverRegs))
      return UnableToLegalize;

    LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
    SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
    if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
                      RHSPartRegs, RHSLeftoverRegs))
      return UnableToLegalize;

    // We now have the LHS and RHS of the compare split into narrow-type
    // registers, plus potentially some leftover type.
    Register Dst = MI.getOperand(0).getReg();
    LLT ResTy = MRI.getType(Dst);
    if (ICmpInst::isEquality(Pred)) {
      // For each part on the LHS and RHS, keep track of the result of XOR-ing
      // them together. For each equal part, the result should be all 0s. For
      // each non-equal part, we'll get at least one 1.
      auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
      SmallVector<Register, 4> Xors;
      for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
        Xors.push_back(Xor);
      }

      // Build a G_XOR for each leftover register. Each G_XOR must be widened
      // to the desired narrow type so that we can OR them together later.
      SmallVector<Register, 4> WidenedXors;
      for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
        auto LHS = std::get<0>(LHSAndRHS);
        auto RHS = std::get<1>(LHSAndRHS);
        auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
        LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
        buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
                            /* PadStrategy = */ TargetOpcode::G_ZEXT);
        Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
      }

      // Now, for each part we broke up, we know if they are equal/not equal
      // based off the G_XOR. We can OR these all together and compare against
      // 0 to get the result.
      assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
      auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
      for (unsigned I = 2, E = Xors.size(); I < E; ++I)
        Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
      MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
    } else {
      // TODO: Handle non-power-of-two types.
      assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
      assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
      Register LHSL = LHSPartRegs[0];
      Register LHSH = LHSPartRegs[1];
      Register RHSL = RHSPartRegs[0];
      Register RHSH = RHSPartRegs[1];
      // Ordered compare: result is the high-half compare unless the high
      // halves are equal, in which case the unsigned low-half compare decides.
      MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpHEQ =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
      MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
          ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
      MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SEXT_INREG: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    int64_t SizeInBits = MI.getOperand(2).getImm();

    // So long as the new type has more bits than the bits we're extending we
    // don't need to break it apart.
    if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
      Observer.changingInstr(MI);
      // We don't lose any non-extension bits by truncating the src and
      // sign-extending the dst.
      MachineOperand &MO1 = MI.getOperand(1);
      auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
      MO1.setReg(TruncMIB.getReg(0));

      MachineOperand &MO2 = MI.getOperand(0);
      Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
      MIRBuilder.buildSExt(MO2, DstExt);
      MO2.setReg(DstExt);
      Observer.changedInstr(MI);
      return Legalized;
    }

    // Break it apart. Components below the extension point are unmodified. The
    // component containing the extension point becomes a narrower SEXT_INREG.
    // Components above it are ashr'd from the component containing the
    // extension point.
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;
    int NumParts = SizeOp0 / NarrowSize;

    // List the registers where the destination will be scattered.
    SmallVector<Register, 2> DstRegs;
    // List the registers where the source will be split.
    SmallVector<Register, 2> SrcRegs;

    // Create all the temporary registers.
    for (int i = 0; i < NumParts; ++i) {
      Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);

      SrcRegs.push_back(SrcReg);
    }

    // Explode the big arguments into smaller chunks.
    MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));

    Register AshrCstReg =
        MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
            .getReg(0);
    Register FullExtensionReg = 0;
    Register PartialExtensionReg = 0;

    // Do the operation on each small part.
    for (int i = 0; i < NumParts; ++i) {
      if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
        DstRegs.push_back(SrcRegs[i]);
      else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
        assert(PartialExtensionReg &&
               "Expected to visit partial extension before full");
        if (FullExtensionReg) {
          DstRegs.push_back(FullExtensionReg);
          continue;
        }
        DstRegs.push_back(
            MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
                .getReg(0));
        FullExtensionReg = DstRegs.back();
      } else {
        // NOTE(review): if SizeInBits is an exact multiple of the narrow
        // width, the modulo below yields a 0-bit G_SEXT_INREG — presumably
        // that combination can't reach here; confirm against the legality
        // rules for this opcode.
        DstRegs.push_back(
            MIRBuilder
                .buildInstr(
                    TargetOpcode::G_SEXT_INREG, {NarrowTy},
                    {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
                .getReg(0));
        PartialExtensionReg = DstRegs.back();
      }
    }

    // Gather the destination registers into the final destination.
    Register DstReg = MI.getOperand(0).getReg();
    MIRBuilder.buildMerge(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

    // Apply the operation to each part in reversed order: both bswap and
    // bitreverse swap the order of the pieces as well as their contents.
    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }
}
1316 
1317 Register LegalizerHelper::coerceToScalar(Register Val) {
1318   LLT Ty = MRI.getType(Val);
1319   if (Ty.isScalar())
1320     return Val;
1321 
1322   const DataLayout &DL = MIRBuilder.getDataLayout();
1323   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1324   if (Ty.isPointer()) {
1325     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1326       return Register();
1327     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1328   }
1329 
1330   Register NewVal = Val;
1331 
1332   assert(Ty.isVector());
1333   LLT EltTy = Ty.getElementType();
1334   if (EltTy.isPointer())
1335     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1336   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1337 }
1338 
1339 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1340                                      unsigned OpIdx, unsigned ExtOpcode) {
1341   MachineOperand &MO = MI.getOperand(OpIdx);
1342   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1343   MO.setReg(ExtB.getReg(0));
1344 }
1345 
1346 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1347                                       unsigned OpIdx) {
1348   MachineOperand &MO = MI.getOperand(OpIdx);
1349   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1350   MO.setReg(ExtB.getReg(0));
1351 }
1352 
1353 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1354                                      unsigned OpIdx, unsigned TruncOpcode) {
1355   MachineOperand &MO = MI.getOperand(OpIdx);
1356   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1357   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1358   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1359   MO.setReg(DstExt);
1360 }
1361 
1362 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1363                                       unsigned OpIdx, unsigned ExtOpcode) {
1364   MachineOperand &MO = MI.getOperand(OpIdx);
1365   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1366   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1367   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1368   MO.setReg(DstTrunc);
1369 }
1370 
1371 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1372                                             unsigned OpIdx) {
1373   MachineOperand &MO = MI.getOperand(OpIdx);
1374   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1375   MO.setReg(widenWithUnmerge(WideTy, MO.getReg()));
1376 }
1377 
1378 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1379                                             unsigned OpIdx) {
1380   MachineOperand &MO = MI.getOperand(OpIdx);
1381 
1382   LLT OldTy = MRI.getType(MO.getReg());
1383   unsigned OldElts = OldTy.getNumElements();
1384   unsigned NewElts = MoreTy.getNumElements();
1385 
1386   unsigned NumParts = NewElts / OldElts;
1387 
1388   // Use concat_vectors if the result is a multiple of the number of elements.
1389   if (NumParts * OldElts == NewElts) {
1390     SmallVector<Register, 8> Parts;
1391     Parts.push_back(MO.getReg());
1392 
1393     Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0);
1394     for (unsigned I = 1; I != NumParts; ++I)
1395       Parts.push_back(ImpDef);
1396 
1397     auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts);
1398     MO.setReg(Concat.getReg(0));
1399     return;
1400   }
1401 
1402   Register MoreReg = MRI.createGenericVirtualRegister(MoreTy);
1403   Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0);
1404   MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0);
1405   MO.setReg(MoreReg);
1406 }
1407 
1408 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1409   MachineOperand &Op = MI.getOperand(OpIdx);
1410   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1411 }
1412 
1413 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1414   MachineOperand &MO = MI.getOperand(OpIdx);
1415   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1416   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1417   MIRBuilder.buildBitcast(MO, CastDst);
1418   MO.setReg(CastDst);
1419 }
1420 
1421 LegalizerHelper::LegalizeResult
1422 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1423                                         LLT WideTy) {
1424   if (TypeIdx != 1)
1425     return UnableToLegalize;
1426 
1427   Register DstReg = MI.getOperand(0).getReg();
1428   LLT DstTy = MRI.getType(DstReg);
1429   if (DstTy.isVector())
1430     return UnableToLegalize;
1431 
1432   Register Src1 = MI.getOperand(1).getReg();
1433   LLT SrcTy = MRI.getType(Src1);
1434   const int DstSize = DstTy.getSizeInBits();
1435   const int SrcSize = SrcTy.getSizeInBits();
1436   const int WideSize = WideTy.getSizeInBits();
1437   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1438 
1439   unsigned NumOps = MI.getNumOperands();
1440   unsigned NumSrc = MI.getNumOperands() - 1;
1441   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1442 
1443   if (WideSize >= DstSize) {
1444     // Directly pack the bits in the target type.
1445     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1446 
1447     for (unsigned I = 2; I != NumOps; ++I) {
1448       const unsigned Offset = (I - 1) * PartSize;
1449 
1450       Register SrcReg = MI.getOperand(I).getReg();
1451       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1452 
1453       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1454 
1455       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1456         MRI.createGenericVirtualRegister(WideTy);
1457 
1458       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1459       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1460       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1461       ResultReg = NextResult;
1462     }
1463 
1464     if (WideSize > DstSize)
1465       MIRBuilder.buildTrunc(DstReg, ResultReg);
1466     else if (DstTy.isPointer())
1467       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1468 
1469     MI.eraseFromParent();
1470     return Legalized;
1471   }
1472 
1473   // Unmerge the original values to the GCD type, and recombine to the next
1474   // multiple greater than the original type.
1475   //
1476   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1477   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1478   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1479   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1480   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1481   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1482   // %12:_(s12) = G_MERGE_VALUES %10, %11
1483   //
1484   // Padding with undef if necessary:
1485   //
1486   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1487   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1488   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1489   // %7:_(s2) = G_IMPLICIT_DEF
1490   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1491   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1492   // %10:_(s12) = G_MERGE_VALUES %8, %9
1493 
1494   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1495   LLT GCDTy = LLT::scalar(GCD);
1496 
1497   SmallVector<Register, 8> Parts;
1498   SmallVector<Register, 8> NewMergeRegs;
1499   SmallVector<Register, 8> Unmerges;
1500   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1501 
1502   // Decompose the original operands if they don't evenly divide.
1503   for (int I = 1, E = MI.getNumOperands(); I != E; ++I) {
1504     Register SrcReg = MI.getOperand(I).getReg();
1505     if (GCD == SrcSize) {
1506       Unmerges.push_back(SrcReg);
1507     } else {
1508       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1509       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1510         Unmerges.push_back(Unmerge.getReg(J));
1511     }
1512   }
1513 
1514   // Pad with undef to the next size that is a multiple of the requested size.
1515   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1516     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1517     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1518       Unmerges.push_back(UndefReg);
1519   }
1520 
1521   const int PartsPerGCD = WideSize / GCD;
1522 
1523   // Build merges of each piece.
1524   ArrayRef<Register> Slicer(Unmerges);
1525   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1526     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1527     NewMergeRegs.push_back(Merge.getReg(0));
1528   }
1529 
1530   // A truncate may be necessary if the requested type doesn't evenly divide the
1531   // original result type.
1532   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1533     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1534   } else {
1535     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1536     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1537   }
1538 
1539   MI.eraseFromParent();
1540   return Legalized;
1541 }
1542 
1543 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1544   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1545   LLT OrigTy = MRI.getType(OrigReg);
1546   LLT LCMTy = getLCMType(WideTy, OrigTy);
1547 
1548   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1549   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1550 
1551   Register UnmergeSrc = WideReg;
1552 
1553   // Create a merge to the LCM type, padding with undef
1554   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1555   // =>
1556   // %1:_(<4 x s32>) = G_FOO
1557   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1558   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1559   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1560   if (NumMergeParts > 1) {
1561     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1562     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1563     MergeParts[0] = WideReg;
1564     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1565   }
1566 
1567   // Unmerge to the original register and pad with dead defs.
1568   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1569   UnmergeResults[0] = OrigReg;
1570   for (int I = 1; I != NumUnmergeParts; ++I)
1571     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1572 
1573   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1574   return WideReg;
1575 }
1576 
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  // Only widening the destination type (type index 0) is implemented.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // The last operand is the source; all preceding operands are destinations.
  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      // A pointer source can only be handled by reinterpreting it as an
      // integer of the same width, which is invalid in a non-integral
      // address space.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type by shifting and truncating.
    unsigned DstSize = DstTy.getSizeInBits();

    // Piece 0 is just the low bits of the source.
    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    // Unpack every piece of the requested unmerge down to the GCD type, then
    // remerge consecutive runs of GCD-typed pieces into each original
    // destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
1705 
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  LLT DstTy = MRI.getType(DstReg);
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    // Widening the result type: replace the extract with a shift of the
    // (possibly extended) source followed by a truncate.
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    // Shift the requested bits down to bit 0 and truncate to the result type.
    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  // Widening the source type (type index 1).
  if (SrcTy.isScalar()) {
    // Any-extending a scalar source leaves the extracted bits in place.
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // For vector sources, only whole-element extracts are handled.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  // Scale the bit offset by the ratio of the widened size to the original
  // size so the same element index is extracted from the widened vector.
  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
1783 
1784 LegalizerHelper::LegalizeResult
1785 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1786                                    LLT WideTy) {
1787   if (TypeIdx != 0 || WideTy.isVector())
1788     return UnableToLegalize;
1789   Observer.changingInstr(MI);
1790   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1791   widenScalarDst(MI, WideTy);
1792   Observer.changedInstr(MI);
1793   return Legalized;
1794 }
1795 
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  // Only widening the arithmetic result type (type index 0) is implemented;
  // widening the carry/overflow boolean type is not.
  if (TypeIdx == 1)
    return UnableToLegalize; // TODO

  // Map the overflow opcode to the plain arithmetic to perform in the wide
  // type, and pick the extension that preserves the value for the operation's
  // signedness (sext for signed, zext for unsigned). The carry-consuming *E
  // opcodes remain carry ops in the wide type and keep their carry-in operand.
  unsigned Opcode;
  unsigned ExtOpcode;
  Optional<Register> CarryIn = None;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  // Extend both operands to the wide type.
  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  // The narrow operation overflowed iff re-extending the truncated wide
  // result does not reproduce the wide result.
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
1869 
1870 LegalizerHelper::LegalizeResult
1871 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1872                                          LLT WideTy) {
1873   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1874                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1875                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1876   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1877                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1878   // We can convert this to:
1879   //   1. Any extend iN to iM
1880   //   2. SHL by M-N
1881   //   3. [US][ADD|SUB|SHL]SAT
1882   //   4. L/ASHR by M-N
1883   //
1884   // It may be more efficient to lower this to a min and a max operation in
1885   // the higher precision arithmetic if the promoted operation isn't legal,
1886   // but this decision is up to the target's lowering request.
1887   Register DstReg = MI.getOperand(0).getReg();
1888 
1889   unsigned NewBits = WideTy.getScalarSizeInBits();
1890   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1891 
1892   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1893   // must not left shift the RHS to preserve the shift amount.
1894   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1895   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1896                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1897   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1898   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1899   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1900 
1901   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1902                                         {ShiftL, ShiftR}, MI.getFlags());
1903 
1904   // Use a shift that will preserve the number of sign bits when the trunc is
1905   // folded away.
1906   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1907                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1908 
1909   MIRBuilder.buildTrunc(DstReg, Result);
1910   MI.eraseFromParent();
1911   return Legalized;
1912 }
1913 
1914 LegalizerHelper::LegalizeResult
1915 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1916                                  LLT WideTy) {
1917   if (TypeIdx == 1)
1918     return UnableToLegalize;
1919 
1920   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1921   Register Result = MI.getOperand(0).getReg();
1922   Register OriginalOverflow = MI.getOperand(1).getReg();
1923   Register LHS = MI.getOperand(2).getReg();
1924   Register RHS = MI.getOperand(3).getReg();
1925   LLT SrcTy = MRI.getType(LHS);
1926   LLT OverflowTy = MRI.getType(OriginalOverflow);
1927   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1928 
1929   // To determine if the result overflowed in the larger type, we extend the
1930   // input to the larger type, do the multiply (checking if it overflows),
1931   // then also check the high bits of the result to see if overflow happened
1932   // there.
1933   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1934   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1935   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1936 
1937   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1938                                     {LeftOperand, RightOperand});
1939   auto Mul = Mulo->getOperand(0);
1940   MIRBuilder.buildTrunc(Result, Mul);
1941 
1942   MachineInstrBuilder ExtResult;
1943   // Overflow occurred if it occurred in the larger type, or if the high part
1944   // of the result does not zero/sign-extend the low part.  Check this second
1945   // possibility first.
1946   if (IsSigned) {
1947     // For signed, overflow occurred when the high part does not sign-extend
1948     // the low part.
1949     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
1950   } else {
1951     // Unsigned overflow occurred when the high part does not zero-extend the
1952     // low part.
1953     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
1954   }
1955 
1956   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
1957   // so we don't need to check the overflow result of larger type Mulo.
1958   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
1959     auto Overflow =
1960         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
1961     // Finally check if the multiplication in the larger type itself overflowed.
1962     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
1963   } else {
1964     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
1965   }
1966   MI.eraseFromParent();
1967   return Legalized;
1968 }
1969 
1970 LegalizerHelper::LegalizeResult
1971 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
1972   switch (MI.getOpcode()) {
1973   default:
1974     return UnableToLegalize;
1975   case TargetOpcode::G_ATOMICRMW_XCHG:
1976   case TargetOpcode::G_ATOMICRMW_ADD:
1977   case TargetOpcode::G_ATOMICRMW_SUB:
1978   case TargetOpcode::G_ATOMICRMW_AND:
1979   case TargetOpcode::G_ATOMICRMW_OR:
1980   case TargetOpcode::G_ATOMICRMW_XOR:
1981   case TargetOpcode::G_ATOMICRMW_MIN:
1982   case TargetOpcode::G_ATOMICRMW_MAX:
1983   case TargetOpcode::G_ATOMICRMW_UMIN:
1984   case TargetOpcode::G_ATOMICRMW_UMAX:
1985     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
1986     Observer.changingInstr(MI);
1987     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1988     widenScalarDst(MI, WideTy, 0);
1989     Observer.changedInstr(MI);
1990     return Legalized;
1991   case TargetOpcode::G_ATOMIC_CMPXCHG:
1992     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
1993     Observer.changingInstr(MI);
1994     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
1995     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
1996     widenScalarDst(MI, WideTy, 0);
1997     Observer.changedInstr(MI);
1998     return Legalized;
1999   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2000     if (TypeIdx == 0) {
2001       Observer.changingInstr(MI);
2002       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2003       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2004       widenScalarDst(MI, WideTy, 0);
2005       Observer.changedInstr(MI);
2006       return Legalized;
2007     }
2008     assert(TypeIdx == 1 &&
2009            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2010     Observer.changingInstr(MI);
2011     widenScalarDst(MI, WideTy, 1);
2012     Observer.changedInstr(MI);
2013     return Legalized;
2014   case TargetOpcode::G_EXTRACT:
2015     return widenScalarExtract(MI, TypeIdx, WideTy);
2016   case TargetOpcode::G_INSERT:
2017     return widenScalarInsert(MI, TypeIdx, WideTy);
2018   case TargetOpcode::G_MERGE_VALUES:
2019     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2020   case TargetOpcode::G_UNMERGE_VALUES:
2021     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2022   case TargetOpcode::G_SADDO:
2023   case TargetOpcode::G_SSUBO:
2024   case TargetOpcode::G_UADDO:
2025   case TargetOpcode::G_USUBO:
2026   case TargetOpcode::G_SADDE:
2027   case TargetOpcode::G_SSUBE:
2028   case TargetOpcode::G_UADDE:
2029   case TargetOpcode::G_USUBE:
2030     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2031   case TargetOpcode::G_UMULO:
2032   case TargetOpcode::G_SMULO:
2033     return widenScalarMulo(MI, TypeIdx, WideTy);
2034   case TargetOpcode::G_SADDSAT:
2035   case TargetOpcode::G_SSUBSAT:
2036   case TargetOpcode::G_SSHLSAT:
2037   case TargetOpcode::G_UADDSAT:
2038   case TargetOpcode::G_USUBSAT:
2039   case TargetOpcode::G_USHLSAT:
2040     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2041   case TargetOpcode::G_CTTZ:
2042   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2043   case TargetOpcode::G_CTLZ:
2044   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2045   case TargetOpcode::G_CTPOP: {
2046     if (TypeIdx == 0) {
2047       Observer.changingInstr(MI);
2048       widenScalarDst(MI, WideTy, 0);
2049       Observer.changedInstr(MI);
2050       return Legalized;
2051     }
2052 
2053     Register SrcReg = MI.getOperand(1).getReg();
2054 
2055     // First extend the input.
2056     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2057                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2058                           ? TargetOpcode::G_ANYEXT
2059                           : TargetOpcode::G_ZEXT;
2060     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2061     LLT CurTy = MRI.getType(SrcReg);
2062     unsigned NewOpc = MI.getOpcode();
2063     if (NewOpc == TargetOpcode::G_CTTZ) {
2064       // The count is the same in the larger type except if the original
2065       // value was zero.  This can be handled by setting the bit just off
2066       // the top of the original type.
2067       auto TopBit =
2068           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2069       MIBSrc = MIRBuilder.buildOr(
2070         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2071       // Now we know the operand is non-zero, use the more relaxed opcode.
2072       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2073     }
2074 
2075     // Perform the operation at the larger size.
2076     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2077     // This is already the correct result for CTPOP and CTTZs
2078     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2079         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2080       // The correct result is NewOp - (Difference in widety and current ty).
2081       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2082       MIBNewOp = MIRBuilder.buildSub(
2083           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2084     }
2085 
2086     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2087     MI.eraseFromParent();
2088     return Legalized;
2089   }
2090   case TargetOpcode::G_BSWAP: {
2091     Observer.changingInstr(MI);
2092     Register DstReg = MI.getOperand(0).getReg();
2093 
2094     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2095     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2096     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2097     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2098 
2099     MI.getOperand(0).setReg(DstExt);
2100 
2101     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2102 
2103     LLT Ty = MRI.getType(DstReg);
2104     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2105     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2106     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2107 
2108     MIRBuilder.buildTrunc(DstReg, ShrReg);
2109     Observer.changedInstr(MI);
2110     return Legalized;
2111   }
2112   case TargetOpcode::G_BITREVERSE: {
2113     Observer.changingInstr(MI);
2114 
2115     Register DstReg = MI.getOperand(0).getReg();
2116     LLT Ty = MRI.getType(DstReg);
2117     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2118 
2119     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2121     MI.getOperand(0).setReg(DstExt);
2122     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2123 
2124     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2125     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2126     MIRBuilder.buildTrunc(DstReg, Shift);
2127     Observer.changedInstr(MI);
2128     return Legalized;
2129   }
2130   case TargetOpcode::G_FREEZE:
2131     Observer.changingInstr(MI);
2132     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2133     widenScalarDst(MI, WideTy);
2134     Observer.changedInstr(MI);
2135     return Legalized;
2136 
2137   case TargetOpcode::G_ABS:
2138     Observer.changingInstr(MI);
2139     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2140     widenScalarDst(MI, WideTy);
2141     Observer.changedInstr(MI);
2142     return Legalized;
2143 
2144   case TargetOpcode::G_ADD:
2145   case TargetOpcode::G_AND:
2146   case TargetOpcode::G_MUL:
2147   case TargetOpcode::G_OR:
2148   case TargetOpcode::G_XOR:
2149   case TargetOpcode::G_SUB:
    // Perform operation at larger width (any extension is fine here, high bits
2151     // don't affect the result) and then truncate the result back to the
2152     // original type.
2153     Observer.changingInstr(MI);
2154     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2155     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2156     widenScalarDst(MI, WideTy);
2157     Observer.changedInstr(MI);
2158     return Legalized;
2159 
2160   case TargetOpcode::G_SBFX:
2161   case TargetOpcode::G_UBFX:
2162     Observer.changingInstr(MI);
2163 
2164     if (TypeIdx == 0) {
2165       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2166       widenScalarDst(MI, WideTy);
2167     } else {
2168       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2169       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2170     }
2171 
2172     Observer.changedInstr(MI);
2173     return Legalized;
2174 
2175   case TargetOpcode::G_SHL:
2176     Observer.changingInstr(MI);
2177 
2178     if (TypeIdx == 0) {
2179       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180       widenScalarDst(MI, WideTy);
2181     } else {
2182       assert(TypeIdx == 1);
2183       // The "number of bits to shift" operand must preserve its value as an
2184       // unsigned integer:
2185       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2186     }
2187 
2188     Observer.changedInstr(MI);
2189     return Legalized;
2190 
2191   case TargetOpcode::G_SDIV:
2192   case TargetOpcode::G_SREM:
2193   case TargetOpcode::G_SMIN:
2194   case TargetOpcode::G_SMAX:
2195     Observer.changingInstr(MI);
2196     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2197     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2198     widenScalarDst(MI, WideTy);
2199     Observer.changedInstr(MI);
2200     return Legalized;
2201 
2202   case TargetOpcode::G_SDIVREM:
2203     Observer.changingInstr(MI);
2204     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2205     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2206     widenScalarDst(MI, WideTy);
2207     widenScalarDst(MI, WideTy, 1);
2208     Observer.changedInstr(MI);
2209     return Legalized;
2210 
2211   case TargetOpcode::G_ASHR:
2212   case TargetOpcode::G_LSHR:
2213     Observer.changingInstr(MI);
2214 
2215     if (TypeIdx == 0) {
2216       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2217         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2218 
2219       widenScalarSrc(MI, WideTy, 1, CvtOp);
2220       widenScalarDst(MI, WideTy);
2221     } else {
2222       assert(TypeIdx == 1);
2223       // The "number of bits to shift" operand must preserve its value as an
2224       // unsigned integer:
2225       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2226     }
2227 
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230   case TargetOpcode::G_UDIV:
2231   case TargetOpcode::G_UREM:
2232   case TargetOpcode::G_UMIN:
2233   case TargetOpcode::G_UMAX:
2234     Observer.changingInstr(MI);
2235     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2236     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2237     widenScalarDst(MI, WideTy);
2238     Observer.changedInstr(MI);
2239     return Legalized;
2240 
2241   case TargetOpcode::G_UDIVREM:
2242     Observer.changingInstr(MI);
2243     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2244     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2245     widenScalarDst(MI, WideTy);
2246     widenScalarDst(MI, WideTy, 1);
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SELECT:
2251     Observer.changingInstr(MI);
2252     if (TypeIdx == 0) {
2253       // Perform operation at larger width (any extension is fine here, high
2254       // bits don't affect the result) and then truncate the result back to the
2255       // original type.
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2257       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2258       widenScalarDst(MI, WideTy);
2259     } else {
2260       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2261       // Explicit extension is required here since high bits affect the result.
2262       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2263     }
2264     Observer.changedInstr(MI);
2265     return Legalized;
2266 
2267   case TargetOpcode::G_FPTOSI:
2268   case TargetOpcode::G_FPTOUI:
2269     Observer.changingInstr(MI);
2270 
2271     if (TypeIdx == 0)
2272       widenScalarDst(MI, WideTy);
2273     else
2274       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2275 
2276     Observer.changedInstr(MI);
2277     return Legalized;
2278   case TargetOpcode::G_SITOFP:
2279     Observer.changingInstr(MI);
2280 
2281     if (TypeIdx == 0)
2282       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2283     else
2284       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2285 
2286     Observer.changedInstr(MI);
2287     return Legalized;
2288   case TargetOpcode::G_UITOFP:
2289     Observer.changingInstr(MI);
2290 
2291     if (TypeIdx == 0)
2292       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2293     else
2294       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295 
2296     Observer.changedInstr(MI);
2297     return Legalized;
2298   case TargetOpcode::G_LOAD:
2299   case TargetOpcode::G_SEXTLOAD:
2300   case TargetOpcode::G_ZEXTLOAD:
2301     Observer.changingInstr(MI);
2302     widenScalarDst(MI, WideTy);
2303     Observer.changedInstr(MI);
2304     return Legalized;
2305 
2306   case TargetOpcode::G_STORE: {
2307     if (TypeIdx != 0)
2308       return UnableToLegalize;
2309 
2310     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2311     if (!Ty.isScalar())
2312       return UnableToLegalize;
2313 
2314     Observer.changingInstr(MI);
2315 
2316     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2317       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2318     widenScalarSrc(MI, WideTy, 0, ExtType);
2319 
2320     Observer.changedInstr(MI);
2321     return Legalized;
2322   }
2323   case TargetOpcode::G_CONSTANT: {
2324     MachineOperand &SrcMO = MI.getOperand(1);
2325     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2326     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2327         MRI.getType(MI.getOperand(0).getReg()));
2328     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2329             ExtOpc == TargetOpcode::G_ANYEXT) &&
2330            "Illegal Extend");
2331     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2332     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2333                            ? SrcVal.sext(WideTy.getSizeInBits())
2334                            : SrcVal.zext(WideTy.getSizeInBits());
2335     Observer.changingInstr(MI);
2336     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2337 
2338     widenScalarDst(MI, WideTy);
2339     Observer.changedInstr(MI);
2340     return Legalized;
2341   }
2342   case TargetOpcode::G_FCONSTANT: {
2343     MachineOperand &SrcMO = MI.getOperand(1);
2344     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2345     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2346     bool LosesInfo;
2347     switch (WideTy.getSizeInBits()) {
2348     case 32:
2349       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2350                   &LosesInfo);
2351       break;
2352     case 64:
2353       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2354                   &LosesInfo);
2355       break;
2356     default:
2357       return UnableToLegalize;
2358     }
2359 
2360     assert(!LosesInfo && "extend should always be lossless");
2361 
2362     Observer.changingInstr(MI);
2363     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2364 
2365     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2366     Observer.changedInstr(MI);
2367     return Legalized;
2368   }
2369   case TargetOpcode::G_IMPLICIT_DEF: {
2370     Observer.changingInstr(MI);
2371     widenScalarDst(MI, WideTy);
2372     Observer.changedInstr(MI);
2373     return Legalized;
2374   }
2375   case TargetOpcode::G_BRCOND:
2376     Observer.changingInstr(MI);
2377     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2378     Observer.changedInstr(MI);
2379     return Legalized;
2380 
2381   case TargetOpcode::G_FCMP:
2382     Observer.changingInstr(MI);
2383     if (TypeIdx == 0)
2384       widenScalarDst(MI, WideTy);
2385     else {
2386       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2387       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2388     }
2389     Observer.changedInstr(MI);
2390     return Legalized;
2391 
2392   case TargetOpcode::G_ICMP:
2393     Observer.changingInstr(MI);
2394     if (TypeIdx == 0)
2395       widenScalarDst(MI, WideTy);
2396     else {
2397       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2398                                MI.getOperand(1).getPredicate()))
2399                                ? TargetOpcode::G_SEXT
2400                                : TargetOpcode::G_ZEXT;
2401       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2402       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2403     }
2404     Observer.changedInstr(MI);
2405     return Legalized;
2406 
2407   case TargetOpcode::G_PTR_ADD:
2408     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2409     Observer.changingInstr(MI);
2410     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2411     Observer.changedInstr(MI);
2412     return Legalized;
2413 
2414   case TargetOpcode::G_PHI: {
2415     assert(TypeIdx == 0 && "Expecting only Idx 0");
2416 
2417     Observer.changingInstr(MI);
2418     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2419       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2420       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2421       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2422     }
2423 
2424     MachineBasicBlock &MBB = *MI.getParent();
2425     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2426     widenScalarDst(MI, WideTy);
2427     Observer.changedInstr(MI);
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2431     if (TypeIdx == 0) {
2432       Register VecReg = MI.getOperand(1).getReg();
2433       LLT VecTy = MRI.getType(VecReg);
2434       Observer.changingInstr(MI);
2435 
2436       widenScalarSrc(
2437           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2438           TargetOpcode::G_SEXT);
2439 
2440       widenScalarDst(MI, WideTy, 0);
2441       Observer.changedInstr(MI);
2442       return Legalized;
2443     }
2444 
2445     if (TypeIdx != 2)
2446       return UnableToLegalize;
2447     Observer.changingInstr(MI);
2448     // TODO: Probably should be zext
2449     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2450     Observer.changedInstr(MI);
2451     return Legalized;
2452   }
2453   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2454     if (TypeIdx == 1) {
2455       Observer.changingInstr(MI);
2456 
2457       Register VecReg = MI.getOperand(1).getReg();
2458       LLT VecTy = MRI.getType(VecReg);
2459       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2460 
2461       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2462       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2463       widenScalarDst(MI, WideVecTy, 0);
2464       Observer.changedInstr(MI);
2465       return Legalized;
2466     }
2467 
2468     if (TypeIdx == 2) {
2469       Observer.changingInstr(MI);
2470       // TODO: Probably should be zext
2471       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2472       Observer.changedInstr(MI);
2473       return Legalized;
2474     }
2475 
2476     return UnableToLegalize;
2477   }
2478   case TargetOpcode::G_FADD:
2479   case TargetOpcode::G_FMUL:
2480   case TargetOpcode::G_FSUB:
2481   case TargetOpcode::G_FMA:
2482   case TargetOpcode::G_FMAD:
2483   case TargetOpcode::G_FNEG:
2484   case TargetOpcode::G_FABS:
2485   case TargetOpcode::G_FCANONICALIZE:
2486   case TargetOpcode::G_FMINNUM:
2487   case TargetOpcode::G_FMAXNUM:
2488   case TargetOpcode::G_FMINNUM_IEEE:
2489   case TargetOpcode::G_FMAXNUM_IEEE:
2490   case TargetOpcode::G_FMINIMUM:
2491   case TargetOpcode::G_FMAXIMUM:
2492   case TargetOpcode::G_FDIV:
2493   case TargetOpcode::G_FREM:
2494   case TargetOpcode::G_FCEIL:
2495   case TargetOpcode::G_FFLOOR:
2496   case TargetOpcode::G_FCOS:
2497   case TargetOpcode::G_FSIN:
2498   case TargetOpcode::G_FLOG10:
2499   case TargetOpcode::G_FLOG:
2500   case TargetOpcode::G_FLOG2:
2501   case TargetOpcode::G_FRINT:
2502   case TargetOpcode::G_FNEARBYINT:
2503   case TargetOpcode::G_FSQRT:
2504   case TargetOpcode::G_FEXP:
2505   case TargetOpcode::G_FEXP2:
2506   case TargetOpcode::G_FPOW:
2507   case TargetOpcode::G_INTRINSIC_TRUNC:
2508   case TargetOpcode::G_INTRINSIC_ROUND:
2509   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2510     assert(TypeIdx == 0);
2511     Observer.changingInstr(MI);
2512 
2513     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2514       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2515 
2516     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2517     Observer.changedInstr(MI);
2518     return Legalized;
2519   case TargetOpcode::G_FPOWI: {
2520     if (TypeIdx != 0)
2521       return UnableToLegalize;
2522     Observer.changingInstr(MI);
2523     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2524     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2525     Observer.changedInstr(MI);
2526     return Legalized;
2527   }
2528   case TargetOpcode::G_INTTOPTR:
2529     if (TypeIdx != 1)
2530       return UnableToLegalize;
2531 
2532     Observer.changingInstr(MI);
2533     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2534     Observer.changedInstr(MI);
2535     return Legalized;
2536   case TargetOpcode::G_PTRTOINT:
2537     if (TypeIdx != 0)
2538       return UnableToLegalize;
2539 
2540     Observer.changingInstr(MI);
2541     widenScalarDst(MI, WideTy, 0);
2542     Observer.changedInstr(MI);
2543     return Legalized;
2544   case TargetOpcode::G_BUILD_VECTOR: {
2545     Observer.changingInstr(MI);
2546 
2547     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2548     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2549       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2550 
2551     // Avoid changing the result vector type if the source element type was
2552     // requested.
2553     if (TypeIdx == 1) {
2554       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2555     } else {
2556       widenScalarDst(MI, WideTy, 0);
2557     }
2558 
2559     Observer.changedInstr(MI);
2560     return Legalized;
2561   }
2562   case TargetOpcode::G_SEXT_INREG:
2563     if (TypeIdx != 0)
2564       return UnableToLegalize;
2565 
2566     Observer.changingInstr(MI);
2567     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2568     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2569     Observer.changedInstr(MI);
2570     return Legalized;
2571   case TargetOpcode::G_PTRMASK: {
2572     if (TypeIdx != 1)
2573       return UnableToLegalize;
2574     Observer.changingInstr(MI);
2575     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     Observer.changedInstr(MI);
2577     return Legalized;
2578   }
2579   }
2580 }
2581 
2582 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2583                              MachineIRBuilder &B, Register Src, LLT Ty) {
2584   auto Unmerge = B.buildUnmerge(Ty, Src);
2585   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2586     Pieces.push_back(Unmerge.getReg(I));
2587 }
2588 
/// Lower a G_BITCAST involving at least one vector type by unmerging the
/// source into pieces, bitcasting the pieces to the destination element shape
/// where the element sizes differ, and re-merging into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      // Bitcast each unmerged piece to the intermediate type before the final
      // merge.
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar source, vector destination: split the scalar into element-sized
    // pieces and build the result vector from them.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMerge(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  // Scalar-to-scalar bitcasts are not handled here.
  return UnableToLegalize;
}
2656 
/// Figure out the bit offset into a register when coercing a vector index for
/// the wide element type. This is only for the case when promoting a vector to
/// one with larger elements.
///
/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
                                                   Register Idx,
                                                   unsigned NewEltSize,
                                                   unsigned OldEltSize) {
  // Ratio of the new to old element size; callers guarantee this is a power
  // of 2.
  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
  LLT IdxTy = B.getMRI()->getType(Idx);

  // Now figure out the amount we need to shift to get the target bits.
  // Keep only the sub-index of the small element within the wide element.
  auto OffsetMask = B.buildConstant(
    IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
  auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
  // Scale the sub-index by the old element size to get a bit offset.
  return B.buildShl(IdxTy, OffsetIdx,
                    B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
}
2678 
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
/// elements, index the bitcasted vector and extract the target element with bit
/// operations. This is intended to force the indexing in the native register
/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source vector operand (type index 1) can be bitcast here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();
  LLT SrcVecTy = MRI.getType(SrcVec);
  LLT IdxTy = MRI.getType(Idx);

  LLT SrcEltTy = SrcVecTy.getElementType();
  // A non-vector cast type acts like a 1-element vector.
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
    //
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // First new element index covered by the requested old element.
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    // Extract each narrow element that together makes up the old element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    // Reassemble the pieces and reinterpret them as the original element type.
    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  // Same element count: nothing useful to do here.
  return UnableToLegalize;
}
2791 
/// Emit code to insert \p InsertReg into \p TargetReg at bit offset \p
/// OffsetBits, while preserving the other bits in \p TargetReg.
///
/// Result = (TargetReg & ~(LowBitsMask(InsertReg.size()) << OffsetBits)) |
///          (zext(InsertReg) << OffsetBits)
static Register buildBitFieldInsert(MachineIRBuilder &B,
                                    Register TargetReg, Register InsertReg,
                                    Register OffsetBits) {
  LLT TargetTy = B.getMRI()->getType(TargetReg);
  LLT InsertTy = B.getMRI()->getType(InsertReg);
  // Zero-extend so the high bits above the inserted field are known zero.
  auto ZextVal = B.buildZExt(TargetTy, InsertReg);
  auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);

  // Produce a bitmask of the value to insert
  auto EltMask = B.buildConstant(
    TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
                                   InsertTy.getSizeInBits()));
  // Shift it into position
  auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
  auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);

  // Clear out the bits in the wide element
  auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);

  // The value to insert has all zeros already, so stick it into the masked
  // wide element.
  return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
}
2819 
/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the vector type (type index 0) can be bitcast here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register Idx = MI.getOperand(3).getReg();

  LLT VecTy = MRI.getType(Dst);
  LLT IdxTy = MRI.getType(Idx);

  LLT VecEltTy = VecTy.getElementType();
  // A non-vector cast type acts like a 1-element vector.
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // Read the wide element that contains the target small element.
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Splice the new value into the wide element, preserving the other bits.
    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    // Write the updated wide element back into the cast vector.
    if (CastTy.isVector()) {
      InsertedElt = MIRBuilder.buildInsertVectorElement(
        CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Decreasing the element size is not implemented here.
  return UnableToLegalize;
}
2888 
/// Lower a load whose memory type is not directly legal: widen non-byte-sized
/// memory types to a byte multiple, or split the access into two smaller
/// power-of-2 sized loads combined with shift/or.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // A mismatch means the memory type is not an integral number of bytes
  // (e.g. s20).
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      // Re-establish the sign bits of the original narrow memory type.
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = PowerOf2Floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  // Memory operands for the two halves, at offsets 0 and LargeSplitSize/8.
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  // Zextload the low half so the high bits are known zero for the OR below.
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  // The high half keeps the original extension kind of the load.
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  // Position the high half above the low half and combine.
  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
3025 
/// Lower a store whose memory type is not directly legal: widen non-byte-sized
/// memory types to a byte multiple with the extra bits zeroed, or split the
/// access into two smaller power-of-2 sized truncating stores.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // A mismatch means the memory type is not an integral number of bytes
  // (e.g. s1).
  if (StoreWidth != StoreSizeInBits) {
    if (SrcTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    // Zero the bits beyond the original memory width so the padding bytes
    // have a defined value.
    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector trunc stores
    if (MemTy != SrcTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Power-of-2 size: assume we're asked to decompose an unaligned store.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    // Pointers can't be shifted; reinterpret as an integer first.
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
    LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
    MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
    MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
    MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
3124 
/// Legalize an instruction by inserting bitcasts of type \p CastTy around the
/// operand/result at type index \p TypeIdx, rewriting the instruction in place
/// to operate on the cast type.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, 0);
    // Keep the memory operand's type in sync with the new result type.
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 0);
    // Keep the memory operand's type in sync with the new value type.
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    // Bitcasting the condition operand of a vector select would change the
    // per-element selection semantics.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    // Cast both select values and the result; the condition is untouched.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 2);
    bitcastSrc(MI, CastTy, 3);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Bitwise ops are insensitive to the bit grouping, so cast everything.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 1);
    bitcastSrc(MI, CastTy, 2);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}
3194 
3195 // Legalize an instruction by changing the opcode in place.
3196 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3197     Observer.changingInstr(MI);
3198     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3199     Observer.changedInstr(MI);
3200 }
3201 
3202 LegalizerHelper::LegalizeResult
3203 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3204   using namespace TargetOpcode;
3205 
3206   switch(MI.getOpcode()) {
3207   default:
3208     return UnableToLegalize;
3209   case TargetOpcode::G_BITCAST:
3210     return lowerBitcast(MI);
3211   case TargetOpcode::G_SREM:
3212   case TargetOpcode::G_UREM: {
3213     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3214     auto Quot =
3215         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3216                               {MI.getOperand(1), MI.getOperand(2)});
3217 
3218     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3219     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3220     MI.eraseFromParent();
3221     return Legalized;
3222   }
3223   case TargetOpcode::G_SADDO:
3224   case TargetOpcode::G_SSUBO:
3225     return lowerSADDO_SSUBO(MI);
3226   case TargetOpcode::G_UMULH:
3227   case TargetOpcode::G_SMULH:
3228     return lowerSMULH_UMULH(MI);
3229   case TargetOpcode::G_SMULO:
3230   case TargetOpcode::G_UMULO: {
3231     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3232     // result.
3233     Register Res = MI.getOperand(0).getReg();
3234     Register Overflow = MI.getOperand(1).getReg();
3235     Register LHS = MI.getOperand(2).getReg();
3236     Register RHS = MI.getOperand(3).getReg();
3237     LLT Ty = MRI.getType(Res);
3238 
3239     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3240                           ? TargetOpcode::G_SMULH
3241                           : TargetOpcode::G_UMULH;
3242 
3243     Observer.changingInstr(MI);
3244     const auto &TII = MIRBuilder.getTII();
3245     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3246     MI.RemoveOperand(1);
3247     Observer.changedInstr(MI);
3248 
3249     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3250     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3251 
3252     // Move insert point forward so we can use the Res register if needed.
3253     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3254 
3255     // For *signed* multiply, overflow is detected by checking:
3256     // (hi != (lo >> bitwidth-1))
3257     if (Opcode == TargetOpcode::G_SMULH) {
3258       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3259       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3260       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3261     } else {
3262       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3263     }
3264     return Legalized;
3265   }
3266   case TargetOpcode::G_FNEG: {
3267     Register Res = MI.getOperand(0).getReg();
3268     LLT Ty = MRI.getType(Res);
3269 
3270     // TODO: Handle vector types once we are able to
3271     // represent them.
3272     if (Ty.isVector())
3273       return UnableToLegalize;
3274     auto SignMask =
3275         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3276     Register SubByReg = MI.getOperand(1).getReg();
3277     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3278     MI.eraseFromParent();
3279     return Legalized;
3280   }
3281   case TargetOpcode::G_FSUB: {
3282     Register Res = MI.getOperand(0).getReg();
3283     LLT Ty = MRI.getType(Res);
3284 
3285     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3286     // First, check if G_FNEG is marked as Lower. If so, we may
3287     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3288     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3289       return UnableToLegalize;
3290     Register LHS = MI.getOperand(1).getReg();
3291     Register RHS = MI.getOperand(2).getReg();
3292     Register Neg = MRI.createGenericVirtualRegister(Ty);
3293     MIRBuilder.buildFNeg(Neg, RHS);
3294     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3295     MI.eraseFromParent();
3296     return Legalized;
3297   }
3298   case TargetOpcode::G_FMAD:
3299     return lowerFMad(MI);
3300   case TargetOpcode::G_FFLOOR:
3301     return lowerFFloor(MI);
3302   case TargetOpcode::G_INTRINSIC_ROUND:
3303     return lowerIntrinsicRound(MI);
3304   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3305     // Since round even is the assumed rounding mode for unconstrained FP
3306     // operations, rint and roundeven are the same operation.
3307     changeOpcode(MI, TargetOpcode::G_FRINT);
3308     return Legalized;
3309   }
3310   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3311     Register OldValRes = MI.getOperand(0).getReg();
3312     Register SuccessRes = MI.getOperand(1).getReg();
3313     Register Addr = MI.getOperand(2).getReg();
3314     Register CmpVal = MI.getOperand(3).getReg();
3315     Register NewVal = MI.getOperand(4).getReg();
3316     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3317                                   **MI.memoperands_begin());
3318     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3319     MI.eraseFromParent();
3320     return Legalized;
3321   }
3322   case TargetOpcode::G_LOAD:
3323   case TargetOpcode::G_SEXTLOAD:
3324   case TargetOpcode::G_ZEXTLOAD:
3325     return lowerLoad(cast<GAnyLoad>(MI));
3326   case TargetOpcode::G_STORE:
3327     return lowerStore(cast<GStore>(MI));
3328   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3329   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3330   case TargetOpcode::G_CTLZ:
3331   case TargetOpcode::G_CTTZ:
3332   case TargetOpcode::G_CTPOP:
3333     return lowerBitCount(MI);
3334   case G_UADDO: {
3335     Register Res = MI.getOperand(0).getReg();
3336     Register CarryOut = MI.getOperand(1).getReg();
3337     Register LHS = MI.getOperand(2).getReg();
3338     Register RHS = MI.getOperand(3).getReg();
3339 
3340     MIRBuilder.buildAdd(Res, LHS, RHS);
3341     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3342 
3343     MI.eraseFromParent();
3344     return Legalized;
3345   }
3346   case G_UADDE: {
3347     Register Res = MI.getOperand(0).getReg();
3348     Register CarryOut = MI.getOperand(1).getReg();
3349     Register LHS = MI.getOperand(2).getReg();
3350     Register RHS = MI.getOperand(3).getReg();
3351     Register CarryIn = MI.getOperand(4).getReg();
3352     LLT Ty = MRI.getType(Res);
3353 
3354     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3355     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3356     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3357     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3358 
3359     MI.eraseFromParent();
3360     return Legalized;
3361   }
3362   case G_USUBO: {
3363     Register Res = MI.getOperand(0).getReg();
3364     Register BorrowOut = MI.getOperand(1).getReg();
3365     Register LHS = MI.getOperand(2).getReg();
3366     Register RHS = MI.getOperand(3).getReg();
3367 
3368     MIRBuilder.buildSub(Res, LHS, RHS);
3369     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3370 
3371     MI.eraseFromParent();
3372     return Legalized;
3373   }
3374   case G_USUBE: {
3375     Register Res = MI.getOperand(0).getReg();
3376     Register BorrowOut = MI.getOperand(1).getReg();
3377     Register LHS = MI.getOperand(2).getReg();
3378     Register RHS = MI.getOperand(3).getReg();
3379     Register BorrowIn = MI.getOperand(4).getReg();
3380     const LLT CondTy = MRI.getType(BorrowOut);
3381     const LLT Ty = MRI.getType(Res);
3382 
3383     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3384     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3385     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3386 
3387     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3388     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3389     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3390 
3391     MI.eraseFromParent();
3392     return Legalized;
3393   }
3394   case G_UITOFP:
3395     return lowerUITOFP(MI);
3396   case G_SITOFP:
3397     return lowerSITOFP(MI);
3398   case G_FPTOUI:
3399     return lowerFPTOUI(MI);
3400   case G_FPTOSI:
3401     return lowerFPTOSI(MI);
3402   case G_FPTRUNC:
3403     return lowerFPTRUNC(MI);
3404   case G_FPOWI:
3405     return lowerFPOWI(MI);
3406   case G_SMIN:
3407   case G_SMAX:
3408   case G_UMIN:
3409   case G_UMAX:
3410     return lowerMinMax(MI);
3411   case G_FCOPYSIGN:
3412     return lowerFCopySign(MI);
3413   case G_FMINNUM:
3414   case G_FMAXNUM:
3415     return lowerFMinNumMaxNum(MI);
3416   case G_MERGE_VALUES:
3417     return lowerMergeValues(MI);
3418   case G_UNMERGE_VALUES:
3419     return lowerUnmergeValues(MI);
3420   case TargetOpcode::G_SEXT_INREG: {
3421     assert(MI.getOperand(2).isImm() && "Expected immediate");
3422     int64_t SizeInBits = MI.getOperand(2).getImm();
3423 
3424     Register DstReg = MI.getOperand(0).getReg();
3425     Register SrcReg = MI.getOperand(1).getReg();
3426     LLT DstTy = MRI.getType(DstReg);
3427     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3428 
3429     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3430     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3431     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3432     MI.eraseFromParent();
3433     return Legalized;
3434   }
3435   case G_EXTRACT_VECTOR_ELT:
3436   case G_INSERT_VECTOR_ELT:
3437     return lowerExtractInsertVectorElt(MI);
3438   case G_SHUFFLE_VECTOR:
3439     return lowerShuffleVector(MI);
3440   case G_DYN_STACKALLOC:
3441     return lowerDynStackAlloc(MI);
3442   case G_EXTRACT:
3443     return lowerExtract(MI);
3444   case G_INSERT:
3445     return lowerInsert(MI);
3446   case G_BSWAP:
3447     return lowerBswap(MI);
3448   case G_BITREVERSE:
3449     return lowerBitreverse(MI);
3450   case G_READ_REGISTER:
3451   case G_WRITE_REGISTER:
3452     return lowerReadWriteRegister(MI);
3453   case G_UADDSAT:
3454   case G_USUBSAT: {
3455     // Try to make a reasonable guess about which lowering strategy to use. The
3456     // target can override this with custom lowering and calling the
3457     // implementation functions.
3458     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3459     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3460       return lowerAddSubSatToMinMax(MI);
3461     return lowerAddSubSatToAddoSubo(MI);
3462   }
3463   case G_SADDSAT:
3464   case G_SSUBSAT: {
3465     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3466 
3467     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3468     // since it's a shorter expansion. However, we would need to figure out the
3469     // preferred boolean type for the carry out for the query.
3470     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3471       return lowerAddSubSatToMinMax(MI);
3472     return lowerAddSubSatToAddoSubo(MI);
3473   }
3474   case G_SSHLSAT:
3475   case G_USHLSAT:
3476     return lowerShlSat(MI);
3477   case G_ABS:
3478     return lowerAbsToAddXor(MI);
3479   case G_SELECT:
3480     return lowerSelect(MI);
3481   case G_SDIVREM:
3482   case G_UDIVREM:
3483     return lowerDIVREM(MI);
3484   case G_FSHL:
3485   case G_FSHR:
3486     return lowerFunnelShift(MI);
3487   case G_ROTL:
3488   case G_ROTR:
3489     return lowerRotate(MI);
3490   case G_ISNAN:
3491     return lowerIsNaN(MI);
3492   }
3493 }
3494 
3495 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3496                                                   Align MinAlign) const {
3497   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3498   // datalayout for the preferred alignment. Also there should be a target hook
3499   // for this to allow targets to reduce the alignment and ignore the
3500   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3501   // the type.
3502   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3503 }
3504 
3505 MachineInstrBuilder
3506 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3507                                       MachinePointerInfo &PtrInfo) {
3508   MachineFunction &MF = MIRBuilder.getMF();
3509   const DataLayout &DL = MIRBuilder.getDataLayout();
3510   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3511 
3512   unsigned AddrSpace = DL.getAllocaAddrSpace();
3513   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3514 
3515   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3516   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3517 }
3518 
3519 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3520                                         LLT VecTy) {
3521   int64_t IdxVal;
3522   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3523     return IdxReg;
3524 
3525   LLT IdxTy = B.getMRI()->getType(IdxReg);
3526   unsigned NElts = VecTy.getNumElements();
3527   if (isPowerOf2_32(NElts)) {
3528     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3529     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3530   }
3531 
3532   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3533       .getReg(0);
3534 }
3535 
3536 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3537                                                   Register Index) {
3538   LLT EltTy = VecTy.getElementType();
3539 
3540   // Calculate the element offset and add it to the pointer.
3541   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3542   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3543          "Converting bits to bytes lost precision");
3544 
3545   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3546 
3547   LLT IdxTy = MRI.getType(Index);
3548   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3549                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3550 
3551   LLT PtrTy = MRI.getType(VecPtr);
3552   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3553 }
3554 
3555 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
3556     MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) {
3557   Register DstReg = MI.getOperand(0).getReg();
3558   LLT DstTy = MRI.getType(DstReg);
3559   LLT LCMTy = getLCMType(DstTy, NarrowTy);
3560 
3561   unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
3562 
3563   auto NewUndef = MIRBuilder.buildUndef(NarrowTy);
3564   SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0));
3565 
3566   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
3567   MI.eraseFromParent();
3568   return Legalized;
3569 }
3570 
3571 // Handle splitting vector operations which need to have the same number of
3572 // elements in each type index, but each type index may have a different element
3573 // type.
3574 //
3575 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3576 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3577 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3578 //
3579 // Also handles some irregular breakdown cases, e.g.
3580 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3581 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3582 //             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
  MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  const LLT NarrowTy0 = NarrowTyArg;
  const Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT LeftoverTy0;

  // All of the operands need to have the same number of elements, so if we can
  // determine a type breakdown for the result type, we can for all of the
  // source types.
  int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first;
  if (NumParts < 0)
    return UnableToLegalize;

  // The narrow instructions being built; constructed while visiting the first
  // operand, then extended with one extra use operand per remaining operand.
  SmallVector<MachineInstrBuilder, 4> NewInsts;

  SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
  SmallVector<Register, 4> PartRegs, LeftoverRegs;

  for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
    Register SrcReg = MI.getOperand(I).getReg();
    LLT SrcTyI = MRI.getType(SrcReg);
    // Narrow type for this operand: same element count as the narrow result,
    // but this operand's scalar element type (operands may differ, e.g. shift
    // amounts).
    const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount()
                                            : ElementCount::getFixed(1);
    LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType());
    LLT LeftoverTyI;

    // Split this operand into the requested typed registers, and any leftover
    // required to reproduce the original type.
    if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs,
                      LeftoverRegs))
      return UnableToLegalize;

    if (I == 1) {
      // For the first operand, create an instruction for each part and setup
      // the result.
      for (Register PartReg : PartRegs) {
        Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0);
        NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
                               .addDef(PartDstReg)
                               .addUse(PartReg));
        DstRegs.push_back(PartDstReg);
      }

      for (Register LeftoverReg : LeftoverRegs) {
        Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0);
        NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode())
                               .addDef(PartDstReg)
                               .addUse(LeftoverReg));
        LeftoverDstRegs.push_back(PartDstReg);
      }
    } else {
      assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size());

      // Add the newly created operand splits to the existing instructions. The
      // odd-sized pieces are ordered after the requested NarrowTyArg sized
      // pieces.
      unsigned InstCount = 0;
      for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J)
        NewInsts[InstCount++].addUse(PartRegs[J]);
      for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J)
        NewInsts[InstCount++].addUse(LeftoverRegs[J]);
    }

    PartRegs.clear();
    LeftoverRegs.clear();
  }

  // Insert the newly built operations and rebuild the result register.
  for (auto &MIB : NewInsts)
    MIRBuilder.insertInstr(MIB);

  insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3664 
3665 LegalizerHelper::LegalizeResult
3666 LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
3667                                           LLT NarrowTy) {
3668   if (TypeIdx != 0)
3669     return UnableToLegalize;
3670 
3671   Register DstReg = MI.getOperand(0).getReg();
3672   Register SrcReg = MI.getOperand(1).getReg();
3673   LLT DstTy = MRI.getType(DstReg);
3674   LLT SrcTy = MRI.getType(SrcReg);
3675 
3676   LLT NarrowTy0 = NarrowTy;
3677   LLT NarrowTy1;
3678   unsigned NumParts;
3679 
3680   if (NarrowTy.isVector()) {
3681     // Uneven breakdown not handled.
3682     NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3683     if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements())
3684       return UnableToLegalize;
3685 
3686     NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType());
3687   } else {
3688     NumParts = DstTy.getNumElements();
3689     NarrowTy1 = SrcTy.getElementType();
3690   }
3691 
3692   SmallVector<Register, 4> SrcRegs, DstRegs;
3693   extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs);
3694 
3695   for (unsigned I = 0; I < NumParts; ++I) {
3696     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
3697     MachineInstr *NewInst =
3698         MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
3699 
3700     NewInst->setFlags(MI.getFlags());
3701     DstRegs.push_back(DstReg);
3702   }
3703 
3704   if (NarrowTy.isVector())
3705     MIRBuilder.buildConcatVectors(DstReg, DstRegs);
3706   else
3707     MIRBuilder.buildBuildVector(DstReg, DstRegs);
3708 
3709   MI.eraseFromParent();
3710   return Legalized;
3711 }
3712 
// Narrow a vector G_ICMP/G_FCMP by splitting both compare operands and the
// result into NumParts pieces, comparing piecewise, and rebuilding the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx,
                                        LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);

  unsigned NumParts;
  LLT NarrowTy0, NarrowTy1; // Narrow result type / narrow source type.

  if (TypeIdx == 0) {
    // Narrowing the result: derive the matching narrow source type from the
    // source's scalar size.
    unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
    unsigned OldElts = DstTy.getNumElements();

    NarrowTy0 = NarrowTy;
    NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements();
    NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(),
                                                  SrcTy.getScalarSizeInBits())
                                    : SrcTy.getElementType();

  } else {
    // Narrowing the source: derive the matching narrow result type.
    // NOTE(review): this branch calls getNumElements()/getElementCount() on
    // NarrowTy unconditionally in places reached when NarrowTy is scalar —
    // looks like a scalar NarrowTy here would assert; confirm callers only
    // request vector NarrowTy for TypeIdx == 1.
    unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
    unsigned OldElts = SrcTy.getNumElements();

    NumParts = NarrowTy.isVector() ? (OldElts / NewElts) :
      NarrowTy.getNumElements();
    NarrowTy0 =
        LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits());
    NarrowTy1 = NarrowTy;
  }

  // FIXME: Don't know how to handle the situation where the small vectors
  // aren't all the same size yet.
  if (NarrowTy1.isVector() &&
      NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements())
    return UnableToLegalize;

  CmpInst::Predicate Pred
    = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());

  SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs;
  extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs);

  for (unsigned I = 0; I < NumParts; ++I) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    DstRegs.push_back(DstReg);

    if (MI.getOpcode() == TargetOpcode::G_ICMP)
      MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
    else {
      MachineInstr *NewCmp
        = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]);
      // Preserve fast-math flags on each split FP compare.
      NewCmp->setFlags(MI.getFlags());
    }
  }

  // Reassemble the narrow compare results into the original destination.
  if (NarrowTy1.isVector())
    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
  else
    MIRBuilder.buildBuildVector(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3779 
// Narrow a G_SELECT with a vector result (TypeIdx 0) or vector condition
// (TypeIdx 1) by splitting the operands and selecting piecewise.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx,
                                           LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register CondReg = MI.getOperand(1).getReg();

  unsigned NumParts = 0;
  LLT NarrowTy0, NarrowTy1; // Narrow value type / narrow condition type.

  LLT DstTy = MRI.getType(DstReg);
  LLT CondTy = MRI.getType(CondReg);
  unsigned Size = DstTy.getSizeInBits();

  assert(TypeIdx == 0 || CondTy.isVector());

  if (TypeIdx == 0) {
    NarrowTy0 = NarrowTy;
    NarrowTy1 = CondTy;

    unsigned NarrowSize = NarrowTy0.getSizeInBits();
    // FIXME: Don't know how to handle the situation where the small vectors
    // aren't all the same size yet.
    if (Size % NarrowSize != 0)
      return UnableToLegalize;

    NumParts = Size / NarrowSize;

    // Need to break down the condition type
    if (CondTy.isVector()) {
      if (CondTy.getNumElements() == NumParts)
        NarrowTy1 = CondTy.getElementType();
      else
        NarrowTy1 =
            LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts),
                        CondTy.getScalarSizeInBits());
    }
  } else {
    NumParts = CondTy.getNumElements();
    if (NarrowTy.isVector()) {
      // TODO: Handle uneven breakdown.
      if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements())
        return UnableToLegalize;

      // NOTE(review): this path is unimplemented — the unconditional return
      // below makes the even-breakdown check above dead code for now.
      return UnableToLegalize;
    } else {
      // Fully scalarize the condition; each select handles one element.
      NarrowTy0 = DstTy.getElementType();
      NarrowTy1 = NarrowTy;
    }
  }

  SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
  // Only split the condition when it is itself a vector; a scalar condition
  // is shared by every piece.
  if (CondTy.isVector())
    extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs);

  extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs);
  extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs);

  for (unsigned i = 0; i < NumParts; ++i) {
    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
    MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg,
                           Src1Regs[i], Src2Regs[i]);
    DstRegs.push_back(DstReg);
  }

  // Reassemble the selected pieces into the original destination.
  if (NarrowTy0.isVector())
    MIRBuilder.buildConcatVectors(DstReg, DstRegs);
  else
    MIRBuilder.buildBuildVector(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
3852 
// Narrow a G_PHI by creating one narrow phi per piece (plus leftover pieces),
// then splitting each incoming value in its predecessor block.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                        LLT NarrowTy) {
  const Register DstReg = MI.getOperand(0).getReg();
  LLT PhiTy = MRI.getType(DstReg);
  LLT LeftoverTy;

  // All of the operands need to have the same number of elements, so if we can
  // determine a type breakdown for the result type, we can for all of the
  // source types.
  int NumParts, NumLeftover;
  std::tie(NumParts, NumLeftover)
    = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy);
  if (NumParts < 0)
    return UnableToLegalize;

  SmallVector<Register, 4> DstRegs, LeftoverDstRegs;
  SmallVector<MachineInstrBuilder, 4> NewInsts;

  const int TotalNumParts = NumParts + NumLeftover;

  // Insert the new phis in the result block first.
  for (int I = 0; I != TotalNumParts; ++I) {
    // The first NumParts phis have NarrowTy; the rest carry the leftover type.
    LLT Ty = I < NumParts ? NarrowTy : LeftoverTy;
    Register PartDstReg = MRI.createGenericVirtualRegister(Ty);
    NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI)
                       .addDef(PartDstReg));
    if (I < NumParts)
      DstRegs.push_back(PartDstReg);
    else
      LeftoverDstRegs.push_back(PartDstReg);
  }

  // Remerge the pieces after the phi group (phis must stay first in the
  // block), producing the original wide value.
  MachineBasicBlock *MBB = MI.getParent();
  MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI());
  insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs);

  SmallVector<Register, 4> PartRegs, LeftoverRegs;

  // Insert code to extract the incoming values in each predecessor block.
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    PartRegs.clear();
    LeftoverRegs.clear();

    Register SrcReg = MI.getOperand(I).getReg();
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    // The split of the incoming value must dominate the predecessor's
    // terminator, so insert just before it.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());

    LLT Unused;
    if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs,
                      LeftoverRegs))
      return UnableToLegalize;

    // Add the newly created operand splits to the existing instructions. The
    // odd-sized pieces are ordered after the requested NarrowTyArg sized
    // pieces.
    for (int J = 0; J != TotalNumParts; ++J) {
      MachineInstrBuilder MIB = NewInsts[J];
      MIB.addUse(J < NumParts ? PartRegs[J] : LeftoverRegs[J - NumParts]);
      MIB.addMBB(&OpMBB);
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
3919 
// Legalize a G_UNMERGE_VALUES by splitting it into two levels of unmerges:
// first unmerge the source into pieces of the GCD type of the source and
// NarrowTy, then unmerge each of those pieces into a contiguous run of the
// original destination registers.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  // Only narrowing of the source operand's type is implemented.
  if (TypeIdx != 1)
    return UnableToLegalize;

  // All operands except the last are defs; the last operand is the source.
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  // TODO: Create sequence of extracts.
  if (DstTy == NarrowTy)
    return UnableToLegalize;

  LLT GCDTy = getGCDType(SrcTy, NarrowTy);
  if (DstTy == GCDTy) {
    // This would just be a copy of the same unmerge.
    // TODO: Create extracts, pad with undef and create intermediate merges.
    return UnableToLegalize;
  }

  // First level: split the source into GCDTy sized pieces.
  auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  // Second level: unmerge each GCDTy piece into PartsPerUnmerge of the
  // original destination registers, preserving their order.
  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
    MIB.addUse(Unmerge.getReg(I));
  }

  MI.eraseFromParent();
  return Legalized;
}
3959 
// Legalize a vector G_UMULO/G_SMULO ((Result, Overflow) = LHS * RHS) by
// unmerging the operands into GCD-typed parts, multiplying each part with
// its own overflow output, and remerging both result streams.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx,
                                         LLT NarrowTy) {
  Register Result = MI.getOperand(0).getReg();
  Register Overflow = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();

  // Scalar mulo is not handled here.
  LLT SrcTy = MRI.getType(LHS);
  if (!SrcTy.isVector())
    return UnableToLegalize;

  LLT ElementType = SrcTy.getElementType();
  LLT OverflowElementTy = MRI.getType(Overflow).getElementType();
  const ElementCount NumResult = SrcTy.getElementCount();
  LLT GCDTy = getGCDType(SrcTy, NarrowTy);

  // Unmerge the operands to smaller parts of GCD type.
  auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS);
  auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS);

  // Per-part result/overflow types: the element count of the source divided
  // evenly over the number of unmerged parts (scalar if one element each).
  const int NumOps = UnmergeLHS->getNumOperands() - 1;
  const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps);
  LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy);
  LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType);

  // Perform the operation over unmerged parts.
  SmallVector<Register, 8> ResultParts;
  SmallVector<Register, 8> OverflowParts;
  for (int I = 0; I != NumOps; ++I) {
    Register Operand1 = UnmergeLHS->getOperand(I).getReg();
    Register Operand2 = UnmergeRHS->getOperand(I).getReg();
    auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy},
                                         {Operand1, Operand2});
    ResultParts.push_back(PartMul->getOperand(0).getReg());
    OverflowParts.push_back(PartMul->getOperand(1).getReg());
  }

  // Widen the result pieces to the LCM type and derive the matching widened
  // overflow type with the same element count.
  LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts);
  LLT OverflowLCMTy =
      LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy);

  // Recombine the pieces to the original result and overflow registers.
  buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts);
  buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts);
  MI.eraseFromParent();
  return Legalized;
}
4008 
// Handle the FewerElementsVector action for a G_BUILD_VECTOR or
// G_CONCAT_VECTORS that produces a vector.
4011 //
4012 // Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with
4013 // undef as necessary.
4014 //
4015 // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
4016 //   -> <2 x s16>
4017 //
4018 // %4:_(s16) = G_IMPLICIT_DEF
4019 // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
4020 // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
4021 // %7:_(<2 x s16>) = G_IMPLICIT_DEF
4022 // %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7
// %3:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %8
4024 LegalizerHelper::LegalizeResult
4025 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4026                                           LLT NarrowTy) {
4027   Register DstReg = MI.getOperand(0).getReg();
4028   LLT DstTy = MRI.getType(DstReg);
4029   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
4030   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
4031 
4032   // Break into a common type
4033   SmallVector<Register, 16> Parts;
4034   for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
4035     extractGCDType(Parts, GCDTy, MI.getOperand(I).getReg());
4036 
4037   // Build the requested new merge, padding with undef.
4038   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts,
4039                                   TargetOpcode::G_ANYEXT);
4040 
4041   // Pack into the original result register.
4042   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4043 
4044   MI.eraseFromParent();
4045   return Legalized;
4046 }
4047 
// Legalize G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT on a wide vector by
// splitting the vector into NarrowVecTy subvectors and operating on the one
// subvector a constant index selects. Variable indices are lowered via a
// stack temporary instead.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  // The index is always the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst =
      getConstantVRegValWithLookThrough(Idx, MRI, /*LookThroughInstrs*/ true,
                                        /*HandleFConstants*/ false);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    // NOTE(review): a negative IdxVal passes this check and would make PartIdx
    // below negative, indexing VecParts out of range — presumably constant
    // indices are non-negative by construction here; verify with callers.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    // PartIdx selects the subvector; NewIdx is the element index rebased to
    // the start of that subvector.
    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
    } else {
      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
4124 
// Split a non-atomic, non-extending/truncating load or store into NarrowTy
// sized memory accesses, plus accesses of a leftover type when the value
// size is not an even multiple of NarrowTy.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(LdStMI);
  Register ValReg = LdStMI.getReg(0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(ValReg);

  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  // For loads only the breakdown counts are needed; for stores the value
  // register is split up-front so the pieces can be stored.
  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  // NumParts stays -1 if the breakdown above was not satisfiable.
  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each elements
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         Offset += PartSize, ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      // Compute the address of this piece at AddrReg + ByteOffset.
      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      // Derive a memory operand for the piece from the original MMO.
      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
    }

    return Offset;
  };

  unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);

  // For loads, recombine the loaded pieces into the original value register.
  if (IsLoad) {
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
4215 
// Common implementation used to split an operation either into pieces with a
// smaller scalar type (narrowing, when NarrowTy changes the element type) or
// into smaller vectors with the same element type (fewerElements). Handles
// instructions with 1 result and 1-3 sources, or 2 results and 1-2 sources.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
                                      LLT NarrowTy) {
  assert(TypeIdx == 0 && "only one type index expected");

  const unsigned Opc = MI.getOpcode();
  const int NumDefOps = MI.getNumExplicitDefs();
  const int NumSrcOps = MI.getNumOperands() - NumDefOps;
  const unsigned Flags = MI.getFlags();
  const unsigned NarrowSize = NarrowTy.getSizeInBits();
  const LLT NarrowScalarTy = LLT::scalar(NarrowSize);

  assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 "
                                     "result and 1-3 sources or 2 results and "
                                     "1-2 sources");

  SmallVector<Register, 2> DstRegs;
  for (int I = 0; I < NumDefOps; ++I)
    DstRegs.push_back(MI.getOperand(I).getReg());

  // First of all check whether we are narrowing (changing the element type)
  // or reducing the vector elements
  const LLT DstTy = MRI.getType(DstRegs[0]);
  const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();

  SmallVector<Register, 8> ExtractedRegs[3];
  SmallVector<Register, 8> Parts;

  // Break down all the sources into NarrowTy pieces we can operate on. This may
  // involve creating merges to a wider type, padded with undef.
  for (int I = 0; I != NumSrcOps; ++I) {
    Register SrcReg = MI.getOperand(I + NumDefOps).getReg();
    LLT SrcTy = MRI.getType(SrcReg);

    // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
    // For fewerElements, this is a smaller vector with the same element type.
    LLT OpNarrowTy;
    if (IsNarrow) {
      OpNarrowTy = NarrowScalarTy;

      // In case of narrowing, we need to cast vectors to scalars for this to
      // work properly
      // FIXME: Can we do without the bitcast here if we're narrowing?
      if (SrcTy.isVector()) {
        SrcTy = LLT::scalar(SrcTy.getSizeInBits());
        SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
      }
    } else {
      auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount()
                                          : ElementCount::getFixed(1);
      OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType());
    }

    LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);

    // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
    buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
                        TargetOpcode::G_ANYEXT);
  }

  // Per-def result pieces, accumulated across the split instructions.
  SmallVector<Register, 8> ResultRegs[2];

  // Input operands for each sub-instruction.
  SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register());

  int NumParts = ExtractedRegs[0].size();
  const unsigned DstSize = DstTy.getSizeInBits();
  const LLT DstScalarTy = LLT::scalar(DstSize);

  // Narrowing needs to use scalar types
  LLT DstLCMTy, NarrowDstTy;
  if (IsNarrow) {
    DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
    NarrowDstTy = NarrowScalarTy;
  } else {
    DstLCMTy = getLCMType(DstTy, NarrowTy);
    NarrowDstTy = NarrowTy;
  }

  // We widened the source registers to satisfy merge/unmerge size
  // constraints. We'll have some extra fully undef parts.
  const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;

  for (int I = 0; I != NumRealParts; ++I) {
    // Emit this instruction on each of the split pieces.
    for (int J = 0; J != NumSrcOps; ++J)
      InputRegs[J] = ExtractedRegs[J][I];

    MachineInstrBuilder Inst;
    if (NumDefOps == 1)
      Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
    else
      Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs,
                                   Flags);

    for (int J = 0; J != NumDefOps; ++J)
      ResultRegs[J].push_back(Inst.getReg(J));
  }

  // Fill out the widened result with undef instead of creating instructions
  // with undef inputs.
  int NumUndefParts = NumParts - NumRealParts;
  if (NumUndefParts != 0) {
    Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0);
    for (int I = 0; I != NumDefOps; ++I)
      ResultRegs[I].append(NumUndefParts, Undef);
  }

  // Extract the possibly padded result. Use a scratch register if we need to do
  // a final bitcast, otherwise use the original result register.
  Register MergeDstReg;
  for (int I = 0; I != NumDefOps; ++I) {
    if (IsNarrow && DstTy.isVector())
      MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
    else
      MergeDstReg = DstRegs[I];

    buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]);

    // Recast to vector if we narrowed a vector
    if (IsNarrow && DstTy.isVector())
      MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
4343 
4344 LegalizerHelper::LegalizeResult
4345 LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
4346                                               LLT NarrowTy) {
4347   Register DstReg = MI.getOperand(0).getReg();
4348   Register SrcReg = MI.getOperand(1).getReg();
4349   int64_t Imm = MI.getOperand(2).getImm();
4350 
4351   LLT DstTy = MRI.getType(DstReg);
4352 
4353   SmallVector<Register, 8> Parts;
4354   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
4355   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
4356 
4357   for (Register &R : Parts)
4358     R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
4359 
4360   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
4361 
4362   MI.eraseFromParent();
4363   return Legalized;
4364 }
4365 
// Top-level dispatcher for the FewerElements legalization action: routes each
// opcode to the splitting strategy implemented above.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;

  switch (MI.getOpcode()) {
  case G_IMPLICIT_DEF:
    return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FSHL:
  case G_FSHR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
    // Simple operations where all types are split identically.
    return reduceOperationWidth(MI, TypeIdx, NarrowTy);
  case G_UMULO:
  case G_SMULO:
    return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy);
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
    // Operations whose operand types may differ from the result type.
    return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
    return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy);
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy);
  case G_SELECT:
    return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
  case G_PHI:
    return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
  // Macro expanding to case labels for all non-sequential vector reductions.
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  default:
    return UnableToLegalize;
  }
}
4497 
// Split a G_SHUFFLE_VECTOR into two half-width shuffles over the four half
// vectors obtained by splitting both sources. When one half of the output
// would need elements from more than two of those inputs, it is built with a
// G_BUILD_VECTOR of individually extracted elements instead.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register Src1Reg = MI.getOperand(1).getReg();
  Register Src2Reg = MI.getOperand(2).getReg();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  LLT DstTy = MRI.getType(DstReg);
  LLT Src1Ty = MRI.getType(Src1Reg);
  LLT Src2Ty = MRI.getType(Src2Reg);
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  // An odd element count can't be split evenly in two below.
  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.getNumElements();

  // Split both sources in half, giving four candidate input vectors.
  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs.  Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element.  This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= array_lengthof(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element.  This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= array_lengthof(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(MIRBuilder
                            .buildExtractVectorElement(
                                EltTy, Inputs[Input],
                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
                            .getReg(0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
    }

    Ops.clear();
  }

  // Concatenate the two halves to form the full-width result.
  MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
4639 
4640 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4641     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4642   unsigned Opc = MI.getOpcode();
4643   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4644          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4645          "Sequential reductions not expected");
4646 
4647   if (TypeIdx != 1)
4648     return UnableToLegalize;
4649 
4650   // The semantics of the normal non-sequential reductions allow us to freely
4651   // re-associate the operation.
4652   Register SrcReg = MI.getOperand(1).getReg();
4653   LLT SrcTy = MRI.getType(SrcReg);
4654   Register DstReg = MI.getOperand(0).getReg();
4655   LLT DstTy = MRI.getType(DstReg);
4656 
4657   if (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)
4658     return UnableToLegalize;
4659 
4660   SmallVector<Register> SplitSrcs;
4661   const unsigned NumParts = SrcTy.getNumElements() / NarrowTy.getNumElements();
4662   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4663   SmallVector<Register> PartialReductions;
4664   for (unsigned Part = 0; Part < NumParts; ++Part) {
4665     PartialReductions.push_back(
4666         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4667   }
4668 
4669   unsigned ScalarOpc;
4670   switch (Opc) {
4671   case TargetOpcode::G_VECREDUCE_FADD:
4672     ScalarOpc = TargetOpcode::G_FADD;
4673     break;
4674   case TargetOpcode::G_VECREDUCE_FMUL:
4675     ScalarOpc = TargetOpcode::G_FMUL;
4676     break;
4677   case TargetOpcode::G_VECREDUCE_FMAX:
4678     ScalarOpc = TargetOpcode::G_FMAXNUM;
4679     break;
4680   case TargetOpcode::G_VECREDUCE_FMIN:
4681     ScalarOpc = TargetOpcode::G_FMINNUM;
4682     break;
4683   case TargetOpcode::G_VECREDUCE_ADD:
4684     ScalarOpc = TargetOpcode::G_ADD;
4685     break;
4686   case TargetOpcode::G_VECREDUCE_MUL:
4687     ScalarOpc = TargetOpcode::G_MUL;
4688     break;
4689   case TargetOpcode::G_VECREDUCE_AND:
4690     ScalarOpc = TargetOpcode::G_AND;
4691     break;
4692   case TargetOpcode::G_VECREDUCE_OR:
4693     ScalarOpc = TargetOpcode::G_OR;
4694     break;
4695   case TargetOpcode::G_VECREDUCE_XOR:
4696     ScalarOpc = TargetOpcode::G_XOR;
4697     break;
4698   case TargetOpcode::G_VECREDUCE_SMAX:
4699     ScalarOpc = TargetOpcode::G_SMAX;
4700     break;
4701   case TargetOpcode::G_VECREDUCE_SMIN:
4702     ScalarOpc = TargetOpcode::G_SMIN;
4703     break;
4704   case TargetOpcode::G_VECREDUCE_UMAX:
4705     ScalarOpc = TargetOpcode::G_UMAX;
4706     break;
4707   case TargetOpcode::G_VECREDUCE_UMIN:
4708     ScalarOpc = TargetOpcode::G_UMIN;
4709     break;
4710   default:
4711     LLVM_DEBUG(dbgs() << "Can't legalize: unknown reduction kind.\n");
4712     return UnableToLegalize;
4713   }
4714 
4715   // If the types involved are powers of 2, we can generate intermediate vector
4716   // ops, before generating a final reduction operation.
4717   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4718       isPowerOf2_32(NarrowTy.getNumElements())) {
4719     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4720   }
4721 
4722   Register Acc = PartialReductions[0];
4723   for (unsigned Part = 1; Part < NumParts; ++Part) {
4724     if (Part == NumParts - 1) {
4725       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4726                             {Acc, PartialReductions[Part]});
4727     } else {
4728       Acc = MIRBuilder
4729                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4730                 .getReg(0);
4731     }
4732   }
4733   MI.eraseFromParent();
4734   return Legalized;
4735 }
4736 
4737 LegalizerHelper::LegalizeResult
4738 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4739                                         LLT SrcTy, LLT NarrowTy,
4740                                         unsigned ScalarOpc) {
4741   SmallVector<Register> SplitSrcs;
4742   // Split the sources into NarrowTy size pieces.
4743   extractParts(SrcReg, NarrowTy,
4744                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4745   // We're going to do a tree reduction using vector operations until we have
4746   // one NarrowTy size value left.
4747   while (SplitSrcs.size() > 1) {
4748     SmallVector<Register> PartialRdxs;
4749     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4750       Register LHS = SplitSrcs[Idx];
4751       Register RHS = SplitSrcs[Idx + 1];
4752       // Create the intermediate vector op.
4753       Register Res =
4754           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4755       PartialRdxs.push_back(Res);
4756     }
4757     SplitSrcs = std::move(PartialRdxs);
4758   }
4759   // Finally generate the requested NarrowTy based reduction.
4760   Observer.changingInstr(MI);
4761   MI.getOperand(1).setReg(SplitSrcs[0]);
4762   Observer.changedInstr(MI);
4763   return Legalized;
4764 }
4765 
// Expand a shift of a double-width value into operations on its two
// half-width pieces when the shift amount \p Amt is a compile-time constant.
// The input is unmerged into low/high halves, and the result halves are
// computed directly by case analysis on the constant amount.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // Shift by zero: just re-merge the two halves unchanged.
  if (Amt.isNullValue()) {
    MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits; // Width of the original, unsplit value.

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      // Everything is shifted out; result is all zeros.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // More than one half-width: low half is zero; high half is the low
      // input shifted by the amount beyond the half boundary.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly one half-width: the low half moves wholesale into the high
      // half.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      // Less than one half-width: shift both halves, and OR the low bits
      // that cross the half boundary (InL >> (NVTBits - Amt)) into the high
      // half.
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    // Logical shift right: mirror image of the G_SHL cases, filling vacated
    // high bits with zero.
    if (Amt.ugt(VTBits)) {
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      // Low half combines its own shifted bits with the bits falling out of
      // the high half across the boundary.
      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR, but vacated bits are filled with the sign,
    // obtained by arithmetically shifting the high half by NVTBits - 1.
    if (Amt.ugt(VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
4853 
// TODO: Optimize if constant shift amount.
// Narrow a scalar shift (G_SHL/G_LSHR/G_ASHR) to operate on half-width
// pieces. For TypeIdx == 1 only the shift-amount operand is narrowed; for
// TypeIdx == 0 the value is split in half and the shift is expanded.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  // Narrowing just the shift amount is a simple in-place truncation of
  // operand 2.
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  // The expansion splits the value exactly in half, so the width must be
  // even.
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  // A constant shift amount allows a much cheaper, branch-free-by-case
  // expansion.
  if (auto VRegAndVal =
          getConstantVRegValWithLookThrough(Amt, MRI, true, false)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // AmtExcess = Amt - NewBitSize: the remaining shift when the amount
  // crosses the half boundary. AmtLack = NewBitSize - Amt: the width of the
  // bits carried between halves when it does not.
  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    // A zero amount must forward InH unchanged: the "short" Hi expression
    // would shift by a full NewBitSize (via AmtLack), which is out of range.
    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});     // Lo from Hi part.

    // As in the G_SHL case, a zero amount must bypass the "short" Lo
    // expression, whose cross-half shift would be out of range.
    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMerge(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
4964 
4965 LegalizerHelper::LegalizeResult
4966 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4967                                        LLT MoreTy) {
4968   assert(TypeIdx == 0 && "Expecting only Idx 0");
4969 
4970   Observer.changingInstr(MI);
4971   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4972     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4973     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4974     moreElementsVectorSrc(MI, MoreTy, I);
4975   }
4976 
4977   MachineBasicBlock &MBB = *MI.getParent();
4978   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4979   moreElementsVectorDst(MI, MoreTy, 0);
4980   Observer.changedInstr(MI);
4981   return Legalized;
4982 }
4983 
// Legalize \p MI by padding its vector type (at \p TypeIdx) out to \p MoreTy,
// which has more elements. Dispatches on opcode to widen the appropriate
// defs/uses; returns UnableToLegalize for unsupported opcode/index
// combinations.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    // Only the result needs widening.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    // Only the stored value (operand 0) needs widening.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM: {
    // Binary ops: widen both sources and the destination uniformly.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT:
    // Only supports widening the source vector (TypeIdx 1).
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_FREEZE:
    // Widen the big vector operand and the result together.
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    // Vector conditions (vselect) are not handled here; only a scalar
    // condition with vector true/false values.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_UNMERGE_VALUES: {
    if (TypeIdx != 1)
      return UnableToLegalize;

    // Widen the source, then rebuild the unmerge with extra defs so the
    // total width of the destinations matches the widened source.
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    int NumDst = MI.getNumOperands() - 1;
    moreElementsVectorSrc(MI, MoreTy, NumDst);

    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
    for (int I = 0; I != NumDst; ++I)
      MIB.addDef(MI.getOperand(I).getReg());

    // Extra pieces covering the padding go to fresh, dead registers.
    int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
    for (int I = NumDst; I != NewNumDst; ++I)
      MIB.addDef(MRI.createGenericVirtualRegister(DstTy));

    MIB.addUse(MI.getOperand(NumDst).getReg());
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  default:
    return UnableToLegalize;
  }
}
5081 
5082 LegalizerHelper::LegalizeResult
5083 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5084                                            unsigned int TypeIdx, LLT MoreTy) {
5085   if (TypeIdx != 0)
5086     return UnableToLegalize;
5087 
5088   Register DstReg = MI.getOperand(0).getReg();
5089   Register Src1Reg = MI.getOperand(1).getReg();
5090   Register Src2Reg = MI.getOperand(2).getReg();
5091   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5092   LLT DstTy = MRI.getType(DstReg);
5093   LLT Src1Ty = MRI.getType(Src1Reg);
5094   LLT Src2Ty = MRI.getType(Src2Reg);
5095   unsigned NumElts = DstTy.getNumElements();
5096   unsigned WidenNumElts = MoreTy.getNumElements();
5097 
5098   // Expect a canonicalized shuffle.
5099   if (DstTy != Src1Ty || DstTy != Src2Ty)
5100     return UnableToLegalize;
5101 
5102   moreElementsVectorSrc(MI, MoreTy, 1);
5103   moreElementsVectorSrc(MI, MoreTy, 2);
5104 
5105   // Adjust mask based on new input vector length.
5106   SmallVector<int, 16> NewMask;
5107   for (unsigned I = 0; I != NumElts; ++I) {
5108     int Idx = Mask[I];
5109     if (Idx < static_cast<int>(NumElts))
5110       NewMask.push_back(Idx);
5111     else
5112       NewMask.push_back(Idx - NumElts + WidenNumElts);
5113   }
5114   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5115     NewMask.push_back(-1);
5116   moreElementsVectorDst(MI, MoreTy, 0);
5117   MIRBuilder.setInstrAndDebugLoc(MI);
5118   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5119                                 MI.getOperand(1).getReg(),
5120                                 MI.getOperand(2).getReg(), NewMask);
5121   MI.eraseFromParent();
5122   return Legalized;
5123 }
5124 
// Long (grade-school) multiplication on NarrowTy-sized limbs. Src1Regs and
// Src2Regs hold the operands' limbs in little-endian order; the product's
// limbs are written into DstRegs. Only DstRegs.size() limbs are produced, so
// any carry out of the most significant requested limb is dropped.
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  // Limb 0 of the result is simply the low half of the product of the two
  // low limbs; no carries can reach it.
  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  // Carry accumulated while summing the factors of the previous limb; it is
  // one of the addends for the next limb.
  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    // These are the low halves of all Src1[j] * Src2[i] with i + j == DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    // These are the high halves (via G_UMULH) of products whose low halves
    // contributed to limb DstIdx - 1.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      // Use overflowing adds (G_UADDO) and zero-extend each carry bit so the
      // carries can themselves be summed for the next limb.
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
5187 
// Narrow a scalar add/sub (with or without carry-in/carry-out variants) by
// splitting both operands into NarrowTy-sized pieces plus an optional
// leftover piece, then chaining carry-propagating ops across the pieces.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // Pick the opcodes used in the expansion:
  //   OpO - first (lowest) piece: overflowing op, produces the initial carry.
  //   OpE - middle pieces: op with carry-in and carry-out.
  //   OpF - final (highest) piece: signed variant for signed-overflow ops so
  //         the final carry-out has the right (signed) semantics.
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  // Carry-in operand is present for the *E (extended) variants.
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  // Split both sources into NarrowTy pieces plus matching leftover pieces.
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);

  // Append the leftover pieces so the main loop can process everything
  // uniformly; NarrowParts remembers where the leftovers start.
  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      // First piece (no carry-in yet): use the overflowing op.
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      // Last piece: use the final opcode (signed variant when relevant).
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    // Chain this piece's carry-out into the next piece.
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              makeArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
5281 
5282 LegalizerHelper::LegalizeResult
5283 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5284   Register DstReg = MI.getOperand(0).getReg();
5285   Register Src1 = MI.getOperand(1).getReg();
5286   Register Src2 = MI.getOperand(2).getReg();
5287 
5288   LLT Ty = MRI.getType(DstReg);
5289   if (Ty.isVector())
5290     return UnableToLegalize;
5291 
5292   unsigned SrcSize = MRI.getType(Src1).getSizeInBits();
5293   unsigned DstSize = Ty.getSizeInBits();
5294   unsigned NarrowSize = NarrowTy.getSizeInBits();
5295   if (DstSize % NarrowSize != 0 || SrcSize % NarrowSize != 0)
5296     return UnableToLegalize;
5297 
5298   unsigned NumDstParts = DstSize / NarrowSize;
5299   unsigned NumSrcParts = SrcSize / NarrowSize;
5300   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5301   unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
5302 
5303   SmallVector<Register, 2> Src1Parts, Src2Parts;
5304   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5305   extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
5306   extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
5307   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5308 
5309   // Take only high half of registers if this is high mul.
5310   ArrayRef<Register> DstRegs(
5311       IsMulHigh ? &DstTmpRegs[DstTmpParts / 2] : &DstTmpRegs[0], NumDstParts);
5312   MIRBuilder.buildMerge(DstReg, DstRegs);
5313   MI.eraseFromParent();
5314   return Legalized;
5315 }
5316 
5317 LegalizerHelper::LegalizeResult
5318 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5319                                    LLT NarrowTy) {
5320   if (TypeIdx != 0)
5321     return UnableToLegalize;
5322 
5323   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5324 
5325   Register Src = MI.getOperand(1).getReg();
5326   LLT SrcTy = MRI.getType(Src);
5327 
5328   // If all finite floats fit into the narrowed integer type, we can just swap
5329   // out the result type. This is practically only useful for conversions from
5330   // half to at least 16-bits, so just handle the one case.
5331   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5332       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5333     return UnableToLegalize;
5334 
5335   Observer.changingInstr(MI);
5336   narrowScalarDst(MI, NarrowTy, 0,
5337                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5338   Observer.changedInstr(MI);
5339   return Legalized;
5340 }
5341 
// Narrow the source of a G_EXTRACT by splitting it into NarrowTy-sized
// pieces and extracting the covered segment from each piece, then remerging
// the segments into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);

  // OpReg/OpStart/OpSize describe the destination window being extracted
  // from the (split) source.
  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    // Bit offset of this source piece within the original source value.
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      // Window begins before this piece: take from the piece's start; the
      // segment ends at the window's end or the piece's end, whichever is
      // first.
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      // Window begins inside this piece: skip to the window's start.
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  // Reassemble the collected segments into the destination.
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMerge(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
5408 
// Narrow a G_INSERT by splitting the big (inserted-into) operand into
// NarrowTy-sized pieces plus an optional leftover piece, performing the
// insert piecewise on the affected pieces, and remerging the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs);

  // Process NarrowTy pieces and leftover pieces uniformly in one list.
  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  // OpReg/OpStart/OpSize describe the value being inserted and its window
  // within the destination.
  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    // Bit offset of this destination piece within the full value.
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      // Insert window starts before this piece: the segment begins at the
      // piece's start, offset into the inserted value accordingly.
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      // Insert window starts inside this piece.
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
        std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    // Carve the relevant segment out of the inserted value if it does not
    // cover it exactly.
    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  // The any-extended leftover piece may have made the merged value wider
  // than the original; truncate back down in that case.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMerge(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMerge(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
5492 
5493 LegalizerHelper::LegalizeResult
5494 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5495                                    LLT NarrowTy) {
5496   Register DstReg = MI.getOperand(0).getReg();
5497   LLT DstTy = MRI.getType(DstReg);
5498 
5499   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5500 
5501   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5502   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5503   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5504   LLT LeftoverTy;
5505   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5506                     Src0Regs, Src0LeftoverRegs))
5507     return UnableToLegalize;
5508 
5509   LLT Unused;
5510   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5511                     Src1Regs, Src1LeftoverRegs))
5512     llvm_unreachable("inconsistent extractParts result");
5513 
5514   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5515     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5516                                         {Src0Regs[I], Src1Regs[I]});
5517     DstRegs.push_back(Inst.getReg(0));
5518   }
5519 
5520   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5521     auto Inst = MIRBuilder.buildInstr(
5522       MI.getOpcode(),
5523       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5524     DstLeftoverRegs.push_back(Inst.getReg(0));
5525   }
5526 
5527   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5528               LeftoverTy, DstLeftoverRegs);
5529 
5530   MI.eraseFromParent();
5531   return Legalized;
5532 }
5533 
5534 LegalizerHelper::LegalizeResult
5535 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5536                                  LLT NarrowTy) {
5537   if (TypeIdx != 0)
5538     return UnableToLegalize;
5539 
5540   Register DstReg = MI.getOperand(0).getReg();
5541   Register SrcReg = MI.getOperand(1).getReg();
5542 
5543   LLT DstTy = MRI.getType(DstReg);
5544   if (DstTy.isVector())
5545     return UnableToLegalize;
5546 
5547   SmallVector<Register, 8> Parts;
5548   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5549   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5550   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5551 
5552   MI.eraseFromParent();
5553   return Legalized;
5554 }
5555 
5556 LegalizerHelper::LegalizeResult
5557 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5558                                     LLT NarrowTy) {
5559   if (TypeIdx != 0)
5560     return UnableToLegalize;
5561 
5562   Register CondReg = MI.getOperand(1).getReg();
5563   LLT CondTy = MRI.getType(CondReg);
5564   if (CondTy.isVector()) // TODO: Handle vselect
5565     return UnableToLegalize;
5566 
5567   Register DstReg = MI.getOperand(0).getReg();
5568   LLT DstTy = MRI.getType(DstReg);
5569 
5570   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5571   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5572   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5573   LLT LeftoverTy;
5574   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5575                     Src1Regs, Src1LeftoverRegs))
5576     return UnableToLegalize;
5577 
5578   LLT Unused;
5579   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5580                     Src2Regs, Src2LeftoverRegs))
5581     llvm_unreachable("inconsistent extractParts result");
5582 
5583   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5584     auto Select = MIRBuilder.buildSelect(NarrowTy,
5585                                          CondReg, Src1Regs[I], Src2Regs[I]);
5586     DstRegs.push_back(Select.getReg(0));
5587   }
5588 
5589   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5590     auto Select = MIRBuilder.buildSelect(
5591       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5592     DstLeftoverRegs.push_back(Select.getReg(0));
5593   }
5594 
5595   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5596               LeftoverTy, DstLeftoverRegs);
5597 
5598   MI.eraseFromParent();
5599   return Legalized;
5600 }
5601 
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  // Only narrowing of the source type (type index 1) is handled.
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  // Only an exact split of a scalar source into two NarrowTy halves is
  // supported.
  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    // The low half may itself be zero, so preserve the original opcode's
    // zero-input semantics when counting its leading zeros.
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    // This value is only selected when Hi != 0, so the zero-undef variant is
    // always safe for the high half.
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
5637 
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  // Only narrowing of the source type (type index 1) is handled.
  if (TypeIdx != 1)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  // Only an exact split of a scalar source into two NarrowTy halves is
  // supported.
  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    // The high half may itself be zero, so preserve the original opcode's
    // zero-input semantics when counting its trailing zeros.
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    // This value is only selected when Lo != 0, so the zero-undef variant is
    // always safe for the low half.
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
5673 
5674 LegalizerHelper::LegalizeResult
5675 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5676                                    LLT NarrowTy) {
5677   if (TypeIdx != 1)
5678     return UnableToLegalize;
5679 
5680   Register DstReg = MI.getOperand(0).getReg();
5681   LLT DstTy = MRI.getType(DstReg);
5682   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5683   unsigned NarrowSize = NarrowTy.getSizeInBits();
5684 
5685   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5686     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5687 
5688     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5689     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5690     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5691 
5692     MI.eraseFromParent();
5693     return Legalized;
5694   }
5695 
5696   return UnableToLegalize;
5697 }
5698 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // Treat legal, libcall and custom actions as "the target can handle this
  // operation", so expansions below may be built in terms of it.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    // Smear the leading set bit rightwards so all bits at or below it are set.
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    // The number of leading zeros is Len minus the number of set bits.
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    // ~x & (x - 1) sets exactly the bits below the lowest set bit of x.
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Otherwise reuse this instruction as a CTPOP of the masked value.
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}
5861 
5862 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5863 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5864                                         Register Reg, unsigned BW) {
5865   return matchUnaryPredicate(
5866       MRI, Reg,
5867       [=](const Constant *C) {
5868         // Null constant here means an undef.
5869         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5870         return !CI || CI->getValue().urem(BW) != 0;
5871       },
5872       /*AllowUndefs*/ true);
5873 }
5874 
5875 LegalizerHelper::LegalizeResult
5876 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5877   Register Dst = MI.getOperand(0).getReg();
5878   Register X = MI.getOperand(1).getReg();
5879   Register Y = MI.getOperand(2).getReg();
5880   Register Z = MI.getOperand(3).getReg();
5881   LLT Ty = MRI.getType(Dst);
5882   LLT ShTy = MRI.getType(Z);
5883 
5884   unsigned BW = Ty.getScalarSizeInBits();
5885 
5886   if (!isPowerOf2_32(BW))
5887     return UnableToLegalize;
5888 
5889   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5890   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5891 
5892   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5893     // fshl X, Y, Z -> fshr X, Y, -Z
5894     // fshr X, Y, Z -> fshl X, Y, -Z
5895     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5896     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5897   } else {
5898     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5899     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5900     auto One = MIRBuilder.buildConstant(ShTy, 1);
5901     if (IsFSHL) {
5902       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5903       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5904     } else {
5905       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5906       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5907     }
5908 
5909     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5910   }
5911 
5912   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5913   MI.eraseFromParent();
5914   return Legalized;
5915 }
5916 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  // Lower a funnel shift into plain shifts and an OR, taking care that no
  // individual shift amount can equal the bit width (which would be UB).
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  Register Z = MI.getOperand(3).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // Z % BW may be zero, so shift by one first and then by (BW - 1 - C) so
    // each individual shift amount stays strictly below BW.
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  // Combine the two shifted halves.
  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}
5974 
5975 LegalizerHelper::LegalizeResult
5976 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5977   // These operations approximately do the following (while avoiding undefined
5978   // shifts by BW):
5979   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5980   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5981   Register Dst = MI.getOperand(0).getReg();
5982   LLT Ty = MRI.getType(Dst);
5983   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5984 
5985   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5986   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5987 
5988   // TODO: Use smarter heuristic that accounts for vector legalization.
5989   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5990     return lowerFunnelShiftAsShifts(MI);
5991 
5992   // This only works for powers of 2, fallback to shifts if it fails.
5993   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5994   if (Result == UnableToLegalize)
5995     return lowerFunnelShiftAsShifts(MI);
5996   return Result;
5997 }
5998 
5999 LegalizerHelper::LegalizeResult
6000 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6001   Register Dst = MI.getOperand(0).getReg();
6002   Register Src = MI.getOperand(1).getReg();
6003   Register Amt = MI.getOperand(2).getReg();
6004   LLT AmtTy = MRI.getType(Amt);
6005   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6006   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6007   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6008   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6009   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6010   MI.eraseFromParent();
6011   return Legalized;
6012 }
6013 
6014 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6015   Register Dst = MI.getOperand(0).getReg();
6016   Register Src = MI.getOperand(1).getReg();
6017   Register Amt = MI.getOperand(2).getReg();
6018   LLT DstTy = MRI.getType(Dst);
6019   LLT SrcTy = MRI.getType(Dst);
6020   LLT AmtTy = MRI.getType(Amt);
6021 
6022   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6023   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6024 
6025   MIRBuilder.setInstrAndDebugLoc(MI);
6026 
6027   // If a rotate in the other direction is supported, use it.
6028   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6029   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6030       isPowerOf2_32(EltSizeInBits))
6031     return lowerRotateWithReverseRotate(MI);
6032 
6033   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6034   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6035   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6036   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6037   Register ShVal;
6038   Register RevShiftVal;
6039   if (isPowerOf2_32(EltSizeInBits)) {
6040     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6041     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6042     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6043     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6044     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6045     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6046     RevShiftVal =
6047         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6048   } else {
6049     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6050     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6051     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6052     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6053     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6054     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6055     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6056     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6057     RevShiftVal =
6058         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6059   }
6060   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6061   MI.eraseFromParent();
6062   return Legalized;
6063 }
6064 
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // The sequence below is a direct translation of this reference code:
  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  // lz = clz(u); the zero case is handled by the select on E below.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  // Biased exponent: 127 (f32 bias) + 63 - lz.
  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  // Normalize the mantissa and drop the implicit leading one bit.
  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  // T holds the 40 bits that will be rounded away.
  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  // Assemble the unrounded result: exponent in bits 23+, mantissa below.
  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  // Round to nearest, ties to even: r = 1 if T > half, V & 1 if T == half.
  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
6123 
6124 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6125   Register Dst = MI.getOperand(0).getReg();
6126   Register Src = MI.getOperand(1).getReg();
6127   LLT DstTy = MRI.getType(Dst);
6128   LLT SrcTy = MRI.getType(Src);
6129 
6130   if (SrcTy == LLT::scalar(1)) {
6131     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6132     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6133     MIRBuilder.buildSelect(Dst, Src, True, False);
6134     MI.eraseFromParent();
6135     return Legalized;
6136   }
6137 
6138   if (SrcTy != LLT::scalar(64))
6139     return UnableToLegalize;
6140 
6141   if (DstTy == LLT::scalar(32)) {
6142     // TODO: SelectionDAG has several alternative expansions to port which may
6143     // be more reasonble depending on the available instructions. If a target
6144     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6145     // intermediate type, this is probably worse.
6146     return lowerU64ToF32BitOps(MI);
6147   }
6148 
6149   return UnableToLegalize;
6150 }
6151 
6152 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6153   Register Dst = MI.getOperand(0).getReg();
6154   Register Src = MI.getOperand(1).getReg();
6155   LLT DstTy = MRI.getType(Dst);
6156   LLT SrcTy = MRI.getType(Src);
6157 
6158   const LLT S64 = LLT::scalar(64);
6159   const LLT S32 = LLT::scalar(32);
6160   const LLT S1 = LLT::scalar(1);
6161 
6162   if (SrcTy == S1) {
6163     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6164     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6165     MIRBuilder.buildSelect(Dst, Src, True, False);
6166     MI.eraseFromParent();
6167     return Legalized;
6168   }
6169 
6170   if (SrcTy != S64)
6171     return UnableToLegalize;
6172 
6173   if (DstTy == S32) {
6174     // signed cl2f(long l) {
6175     //   long s = l >> 63;
6176     //   float r = cul2f((l + s) ^ s);
6177     //   return s ? -r : r;
6178     // }
6179     Register L = Src;
6180     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6181     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6182 
6183     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6184     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6185     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6186 
6187     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6188     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6189                                             MIRBuilder.buildConstant(S64, 0));
6190     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6191     MI.eraseFromParent();
6192     return Legalized;
6193   }
6194 
6195   return UnableToLegalize;
6196 }
6197 
6198 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6199   Register Dst = MI.getOperand(0).getReg();
6200   Register Src = MI.getOperand(1).getReg();
6201   LLT DstTy = MRI.getType(Dst);
6202   LLT SrcTy = MRI.getType(Src);
6203   const LLT S64 = LLT::scalar(64);
6204   const LLT S32 = LLT::scalar(32);
6205 
6206   if (SrcTy != S64 && SrcTy != S32)
6207     return UnableToLegalize;
6208   if (DstTy != S32 && DstTy != S64)
6209     return UnableToLegalize;
6210 
6211   // FPTOSI gives same result as FPTOUI for positive signed integers.
6212   // FPTOUI needs to deal with fp values that convert to unsigned integers
6213   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6214 
6215   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6216   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6217                                                 : APFloat::IEEEdouble(),
6218                     APInt::getNullValue(SrcTy.getSizeInBits()));
6219   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6220 
6221   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6222 
6223   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6224   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6225   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6226   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6227   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6228   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6229   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6230 
6231   const LLT S1 = LLT::scalar(1);
6232 
6233   MachineInstrBuilder FCMP =
6234       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6235   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6236 
6237   MI.eraseFromParent();
6238   return Legalized;
6239 }
6240 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent field (bits 23..30 of an f32).
  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  // Broadcast the sign bit to all bits: 0 for positive, -1 for negative.
  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  // Recover the full significand by OR-ing in the implicit leading one bit.
  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  // Unbias the exponent (f32 bias is 127) and compute how far the significand
  // must move in either direction relative to its 23-bit position.
  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  // Shift left if the exponent exceeds the mantissa width, right otherwise.
  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is all-ones.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  // Values with a negative unbiased exponent have magnitude < 1: result is 0.
  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
6307 
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  // Split the f64 bit pattern into its low (U) and high (UH) 32-bit halves.
  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  // E = the 11-bit biased f64 exponent (bits 30..20 of the high half).
  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
    S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  // M = the mantissa bits that land in the f16 mantissa position, keeping
  // two extra low bits for rounding (hence the 0xffe mask after >> 8).
  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  // Gather every mantissa bit below the ones kept in M; if any is set the
  // value is inexact and must feed the sticky bit.
  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // I = result for an all-ones source exponent:
  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  // i.e. infinity, promoted to a quiet NaN when any mantissa bit survives.
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // Candidate normal-number encoding, with the exponent placed above the
  // mantissa-plus-rounding bits.
  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // Denormal path: right-shift amount needed to denormalize the significand.
  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  // Re-insert the implicit leading one before shifting.
  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  // If the round trip shift dropped any set bits, fold them into a sticky bit.
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                             D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  // Use the denormal encoding D when E < 1, otherwise the normal encoding N.
  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  // Round to nearest, ties to even, using the low 3 bits as
  // (mantissa LSB, round, sticky).
  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  // Low bits == 3 (0b011): a tie with an odd mantissa — round up to even.
  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  // Low bits > 5 (0b110, 0b111): more than halfway — round up.
  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  // Adjusted exponents above 30 overflow the f16 range: clamp to infinity.
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  // E == 1039 is the rebias of an all-ones f64 exponent (2047 - 1023 + 15),
  // i.e. the source was inf/NaN: use the special encoding I.
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
6413 
6414 LegalizerHelper::LegalizeResult
6415 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6416   Register Dst = MI.getOperand(0).getReg();
6417   Register Src = MI.getOperand(1).getReg();
6418 
6419   LLT DstTy = MRI.getType(Dst);
6420   LLT SrcTy = MRI.getType(Src);
6421   const LLT S64 = LLT::scalar(64);
6422   const LLT S16 = LLT::scalar(16);
6423 
6424   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6425     return lowerFPTRUNC_F64_TO_F16(MI);
6426 
6427   return UnableToLegalize;
6428 }
6429 
6430 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6431 // multiplication tree.
6432 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6433   Register Dst = MI.getOperand(0).getReg();
6434   Register Src0 = MI.getOperand(1).getReg();
6435   Register Src1 = MI.getOperand(2).getReg();
6436   LLT Ty = MRI.getType(Dst);
6437 
6438   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6439   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6440   MI.eraseFromParent();
6441   return Legalized;
6442 }
6443 
6444 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6445   switch (Opc) {
6446   case TargetOpcode::G_SMIN:
6447     return CmpInst::ICMP_SLT;
6448   case TargetOpcode::G_SMAX:
6449     return CmpInst::ICMP_SGT;
6450   case TargetOpcode::G_UMIN:
6451     return CmpInst::ICMP_ULT;
6452   case TargetOpcode::G_UMAX:
6453     return CmpInst::ICMP_UGT;
6454   default:
6455     llvm_unreachable("not in integer min/max");
6456   }
6457 }
6458 
6459 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6460   Register Dst = MI.getOperand(0).getReg();
6461   Register Src0 = MI.getOperand(1).getReg();
6462   Register Src1 = MI.getOperand(2).getReg();
6463 
6464   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6465   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6466 
6467   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6468   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6469 
6470   MI.eraseFromParent();
6471   return Legalized;
6472 }
6473 
6474 LegalizerHelper::LegalizeResult
6475 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6476   Register Dst = MI.getOperand(0).getReg();
6477   Register Src0 = MI.getOperand(1).getReg();
6478   Register Src1 = MI.getOperand(2).getReg();
6479 
6480   const LLT Src0Ty = MRI.getType(Src0);
6481   const LLT Src1Ty = MRI.getType(Src1);
6482 
6483   const int Src0Size = Src0Ty.getScalarSizeInBits();
6484   const int Src1Size = Src1Ty.getScalarSizeInBits();
6485 
6486   auto SignBitMask = MIRBuilder.buildConstant(
6487     Src0Ty, APInt::getSignMask(Src0Size));
6488 
6489   auto NotSignBitMask = MIRBuilder.buildConstant(
6490     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6491 
6492   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6493   Register And1;
6494   if (Src0Ty == Src1Ty) {
6495     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6496   } else if (Src0Size > Src1Size) {
6497     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6498     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6499     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6500     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6501   } else {
6502     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6503     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6504     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6505     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6506   }
6507 
6508   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6509   // constants are a nan and -0.0, but the final result should preserve
6510   // everything.
6511   unsigned Flags = MI.getFlags();
6512   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6513 
6514   MI.eraseFromParent();
6515   return Legalized;
6516 }
6517 
6518 LegalizerHelper::LegalizeResult
6519 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6520   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6521     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6522 
6523   Register Dst = MI.getOperand(0).getReg();
6524   Register Src0 = MI.getOperand(1).getReg();
6525   Register Src1 = MI.getOperand(2).getReg();
6526   LLT Ty = MRI.getType(Dst);
6527 
6528   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6529     // Insert canonicalizes if it's possible we need to quiet to get correct
6530     // sNaN behavior.
6531 
6532     // Note this must be done here, and not as an optimization combine in the
6533     // absence of a dedicate quiet-snan instruction as we're using an
6534     // omni-purpose G_FCANONICALIZE.
6535     if (!isKnownNeverSNaN(Src0, MRI))
6536       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6537 
6538     if (!isKnownNeverSNaN(Src1, MRI))
6539       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6540   }
6541 
6542   // If there are no nans, it's safe to simply replace this with the non-IEEE
6543   // version.
6544   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6545   MI.eraseFromParent();
6546   return Legalized;
6547 }
6548 
6549 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6550   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6551   Register DstReg = MI.getOperand(0).getReg();
6552   LLT Ty = MRI.getType(DstReg);
6553   unsigned Flags = MI.getFlags();
6554 
6555   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6556                                   Flags);
6557   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6558   MI.eraseFromParent();
6559   return Legalized;
6560 }
6561 
6562 LegalizerHelper::LegalizeResult
6563 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6564   Register DstReg = MI.getOperand(0).getReg();
6565   Register X = MI.getOperand(1).getReg();
6566   const unsigned Flags = MI.getFlags();
6567   const LLT Ty = MRI.getType(DstReg);
6568   const LLT CondTy = Ty.changeElementSize(1);
6569 
6570   // round(x) =>
6571   //  t = trunc(x);
6572   //  d = fabs(x - t);
6573   //  o = copysign(1.0f, x);
6574   //  return t + (d >= 0.5 ? o : 0.0);
6575 
6576   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6577 
6578   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6579   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6580   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6581   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6582   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6583   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6584 
6585   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6586                                   Flags);
6587   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6588 
6589   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6590 
6591   MI.eraseFromParent();
6592   return Legalized;
6593 }
6594 
6595 LegalizerHelper::LegalizeResult
6596 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6597   Register DstReg = MI.getOperand(0).getReg();
6598   Register SrcReg = MI.getOperand(1).getReg();
6599   unsigned Flags = MI.getFlags();
6600   LLT Ty = MRI.getType(DstReg);
6601   const LLT CondTy = Ty.changeElementSize(1);
6602 
6603   // result = trunc(src);
6604   // if (src < 0.0 && src != result)
6605   //   result += -1.0.
6606 
6607   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6608   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6609 
6610   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6611                                   SrcReg, Zero, Flags);
6612   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6613                                       SrcReg, Trunc, Flags);
6614   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6615   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6616 
6617   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6618   MI.eraseFromParent();
6619   return Legalized;
6620 }
6621 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  // Lower G_MERGE_VALUES by zero-extending each source into a wide scalar,
  // shifting it to its bit offset, and OR-ing the pieces together.
  const unsigned NumOps = MI.getNumOperands();
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(Src0Reg);
  unsigned PartSize = SrcTy.getSizeInBits();

  // Accumulate in a scalar of the full destination width; the first part
  // occupies the low bits and needs no shift.
  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    // On the final OR, write directly into DstReg when no conversion from
    // WideTy is needed; otherwise use a fresh temporary.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Integer-to-pointer casts are invalid in non-integral address spaces.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
6662 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  // Lower G_UNMERGE_VALUES by viewing the source as one wide integer and
  // producing each destination with a right shift + truncate.
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  // Coerce the source to a plain scalar; an invalid register signals failure.
  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  // The first destination is simply the low bits.
  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
6692 
6693 /// Lower a vector extract or insert by writing the vector to a stack temporary
6694 /// and reloading the element or vector.
6695 ///
6696 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6697 ///  =>
6698 ///  %stack_temp = G_FRAME_INDEX
6699 ///  G_STORE %vec, %stack_temp
6700 ///  %idx = clamp(%idx, %vec.getNumElements())
6701 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6702 ///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  // Only G_INSERT_VECTOR_ELT carries a value operand; for extracts InsertVal
  // stays invalid and selects the load-element path below.
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  // The index is the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  // Spill the whole vector to a stack temporary.
  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
                                        VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    // Constant index: the exact offset and alignment are known.
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}
6757 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  // Lower G_SHUFFLE_VECTOR to a G_BUILD_VECTOR of individually extracted
  // elements (or a plain copy in the scalar-result case).
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  LLT Src0Ty = MRI.getType(Src0Reg);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

  if (DstTy.isScalar()) {
    if (Src0Ty.isVector())
      return UnableToLegalize;

    // This is just a SELECT.
    assert(Mask.size() == 1 && "Expected a single mask element");
    Register Val;
    // Out-of-range / negative mask entries produce an undefined value.
    if (Mask[0] < 0 || Mask[0] > 1)
      Val = MIRBuilder.buildUndef(DstTy).getReg(0);
    else
      Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
    MIRBuilder.buildCopy(DstReg, Val);
    MI.eraseFromParent();
    return Legalized;
  }

  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getElementType();

  for (int Idx : Mask) {
    // Negative mask entries select an undefined element; materialize a single
    // G_IMPLICIT_DEF lazily and reuse it for all of them.
    if (Idx < 0) {
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      // Mask indices >= NumElts address elements of the second source vector.
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
6813 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  // Lower G_DYN_STACKALLOC by adjusting the stack pointer directly:
  // SP -= size, optionally aligned down, then SP becomes the result.
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  // This expansion only works for a downward-growing stack.
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  // Read the current stack pointer and view it as an integer.
  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    // Align downward by AND-ing with -Alignment (clears the low bits).
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  // Commit the new SP and also return it as the allocated pointer.
  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}
6850 
6851 LegalizerHelper::LegalizeResult
6852 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6853   Register Dst = MI.getOperand(0).getReg();
6854   Register Src = MI.getOperand(1).getReg();
6855   unsigned Offset = MI.getOperand(2).getImm();
6856 
6857   LLT DstTy = MRI.getType(Dst);
6858   LLT SrcTy = MRI.getType(Src);
6859 
6860   if (DstTy.isScalar() &&
6861       (SrcTy.isScalar() ||
6862        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6863     LLT SrcIntTy = SrcTy;
6864     if (!SrcTy.isScalar()) {
6865       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6866       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6867     }
6868 
6869     if (Offset == 0)
6870       MIRBuilder.buildTrunc(Dst, Src);
6871     else {
6872       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6873       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6874       MIRBuilder.buildTrunc(Dst, Shr);
6875     }
6876 
6877     MI.eraseFromParent();
6878     return Legalized;
6879   }
6880 
6881   return UnableToLegalize;
6882 }
6883 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  // Lower G_INSERT as integer bit manipulation: clear the target bit-range in
  // the source, then OR in the shifted, zero-extended inserted value.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register InsertSrc = MI.getOperand(2).getReg();
  uint64_t Offset = MI.getOperand(3).getImm();

  // NOTE(review): uses Src's type as DstTy — presumably Dst and Src always
  // have the same type for G_INSERT.
  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Integer <-> pointer casts are not valid in non-integral address spaces.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  // Work on a scalar-integer view of the destination and the inserted value.
  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  // Place the inserted bits at the requested bit offset.
  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  // Mask covering every bit outside [Offset, Offset + insert-size).
  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
6935 
6936 LegalizerHelper::LegalizeResult
6937 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6938   Register Dst0 = MI.getOperand(0).getReg();
6939   Register Dst1 = MI.getOperand(1).getReg();
6940   Register LHS = MI.getOperand(2).getReg();
6941   Register RHS = MI.getOperand(3).getReg();
6942   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6943 
6944   LLT Ty = MRI.getType(Dst0);
6945   LLT BoolTy = MRI.getType(Dst1);
6946 
6947   if (IsAdd)
6948     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6949   else
6950     MIRBuilder.buildSub(Dst0, LHS, RHS);
6951 
6952   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6953 
6954   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6955 
6956   // For an addition, the result should be less than one of the operands (LHS)
6957   // if and only if the other operand (RHS) is negative, otherwise there will
6958   // be overflow.
6959   // For a subtraction, the result should be less than one of the operands
6960   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6961   // otherwise there will be overflow.
6962   auto ResultLowerThanLHS =
6963       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6964   auto ConditionRHS = MIRBuilder.buildICmp(
6965       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6966 
6967   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6968   MI.eraseFromParent();
6969   return Legalized;
6970 }
6971 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  // Lower saturating add/sub using min/max: clamp one operand so the base
  // add/sub can no longer overflow.
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    // Clamp RHS into [lo, hi] so the base operation cannot overflow.
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
7048 
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  // Lower saturating add/sub via the overflow-reporting add/sub: perform the
  // raw operation, then select the clamped value when overflow occurred.
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // The arithmetic shift yields 0 or -1 from the wrapped result's sign, so
    // the add produces signed-min or signed-max as the saturated value.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
7116 
7117 LegalizerHelper::LegalizeResult
7118 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7119   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7120           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7121          "Expected shlsat opcode!");
7122   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7123   Register Res = MI.getOperand(0).getReg();
7124   Register LHS = MI.getOperand(1).getReg();
7125   Register RHS = MI.getOperand(2).getReg();
7126   LLT Ty = MRI.getType(Res);
7127   LLT BoolTy = Ty.changeElementSize(1);
7128 
7129   unsigned BW = Ty.getScalarSizeInBits();
7130   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7131   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7132                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7133 
7134   MachineInstrBuilder SatVal;
7135   if (IsSigned) {
7136     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7137     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7138     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7139                                     MIRBuilder.buildConstant(Ty, 0));
7140     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7141   } else {
7142     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7143   }
7144   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7145   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7146 
7147   MI.eraseFromParent();
7148   return Legalized;
7149 }
7150 
7151 LegalizerHelper::LegalizeResult
7152 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7153   Register Dst = MI.getOperand(0).getReg();
7154   Register Src = MI.getOperand(1).getReg();
7155   const LLT Ty = MRI.getType(Src);
7156   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7157   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7158 
7159   // Swap most and least significant byte, set remaining bytes in Res to zero.
7160   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7161   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7162   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7163   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7164 
7165   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7166   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7167     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7168     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7169     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7170     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7171     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7172     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7173     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7174     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7175     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7176     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7177     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7178     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7179   }
7180   Res.getInstr()->getOperand(0).setReg(Dst);
7181 
7182   MI.eraseFromParent();
7183   return Legalized;
7184 }
7185 
7186 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7187 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7188                                  MachineInstrBuilder Src, APInt Mask) {
7189   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7190   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7191   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7192   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7193   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7194   return B.buildOr(Dst, LHS, RHS);
7195 }
7196 
7197 LegalizerHelper::LegalizeResult
7198 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7199   Register Dst = MI.getOperand(0).getReg();
7200   Register Src = MI.getOperand(1).getReg();
7201   const LLT Ty = MRI.getType(Src);
7202   unsigned Size = Ty.getSizeInBits();
7203 
7204   MachineInstrBuilder BSWAP =
7205       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7206 
7207   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7208   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7209   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7210   MachineInstrBuilder Swap4 =
7211       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7212 
7213   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7214   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7215   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7216   MachineInstrBuilder Swap2 =
7217       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7218 
7219   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7220   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7221   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7222   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7223 
7224   MI.eraseFromParent();
7225   return Legalized;
7226 }
7227 
7228 LegalizerHelper::LegalizeResult
7229 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7230   MachineFunction &MF = MIRBuilder.getMF();
7231 
7232   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7233   int NameOpIdx = IsRead ? 1 : 0;
7234   int ValRegIndex = IsRead ? 0 : 1;
7235 
7236   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7237   const LLT Ty = MRI.getType(ValReg);
7238   const MDString *RegStr = cast<MDString>(
7239     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7240 
7241   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7242   if (!PhysReg.isValid())
7243     return UnableToLegalize;
7244 
7245   if (IsRead)
7246     MIRBuilder.buildCopy(ValReg, PhysReg);
7247   else
7248     MIRBuilder.buildCopy(PhysReg, ValReg);
7249 
7250   MI.eraseFromParent();
7251   return Legalized;
7252 }
7253 
7254 LegalizerHelper::LegalizeResult
7255 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7256   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7257   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7258   Register Result = MI.getOperand(0).getReg();
7259   LLT OrigTy = MRI.getType(Result);
7260   auto SizeInBits = OrigTy.getScalarSizeInBits();
7261   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7262 
7263   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7264   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7265   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7266   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7267 
7268   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7269   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7270   MIRBuilder.buildTrunc(Result, Shifted);
7271 
7272   MI.eraseFromParent();
7273   return Legalized;
7274 }
7275 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement vector G_SELECT in terms of XOR, AND, OR.
  Register DstReg = MI.getOperand(0).getReg();
  Register MaskReg = MI.getOperand(1).getReg();
  Register Op1Reg = MI.getOperand(2).getReg();
  Register Op2Reg = MI.getOperand(3).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT MaskTy = MRI.getType(MaskReg);
  LLT Op1Ty = MRI.getType(Op1Reg);
  // Only vector selects are handled here; scalar selects are left alone.
  if (!DstTy.isVector())
    return UnableToLegalize;

  // Vector selects can have a scalar predicate. If so, splat into a vector and
  // finish for later legalization attempts to try again.
  if (MaskTy.isScalar()) {
    Register MaskElt = MaskReg;
    // Sign-extend a narrow condition up to the element size first so the
    // splatted lanes end up as all-zeros/all-ones masks.
    if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
      MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
    // Generate a vector splat idiom to be pattern matched later.
    auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
    // Only operand 1 changes; notify the observer so the instruction gets
    // revisited with the vector mask.
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(ShufSplat.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }

  // The bitwise expansion below needs the mask and the operands to have the
  // same total size.
  if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
    return UnableToLegalize;
  }

  // select(mask, a, b) == (a & mask) | (b & ~mask) when every mask lane is
  // all-ones or all-zeros.
  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  MI.eraseFromParent();
  return Legalized;
}
7313 
7314 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7315   // Split DIVREM into individual instructions.
7316   unsigned Opcode = MI.getOpcode();
7317 
7318   MIRBuilder.buildInstr(
7319       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7320                                         : TargetOpcode::G_UDIV,
7321       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7322   MIRBuilder.buildInstr(
7323       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7324                                         : TargetOpcode::G_UREM,
7325       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7326   MI.eraseFromParent();
7327   return Legalized;
7328 }
7329 
7330 LegalizerHelper::LegalizeResult
7331 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7332   // Expand %res = G_ABS %a into:
7333   // %v1 = G_ASHR %a, scalar_size-1
7334   // %v2 = G_ADD %a, %v1
7335   // %res = G_XOR %v2, %v1
7336   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7337   Register OpReg = MI.getOperand(1).getReg();
7338   auto ShiftAmt =
7339       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7340   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7341   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7342   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7343   MI.eraseFromParent();
7344   return Legalized;
7345 }
7346 
7347 LegalizerHelper::LegalizeResult
7348 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7349   // Expand %res = G_ABS %a into:
7350   // %v1 = G_CONSTANT 0
7351   // %v2 = G_SUB %v1, %a
7352   // %res = G_SMAX %a, %v2
7353   Register SrcReg = MI.getOperand(1).getReg();
7354   LLT Ty = MRI.getType(SrcReg);
7355   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7356   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7357   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7358   MI.eraseFromParent();
7359   return Legalized;
7360 }
7361 
7362 LegalizerHelper::LegalizeResult LegalizerHelper::lowerIsNaN(MachineInstr &MI) {
7363   Register Dst = MI.getOperand(0).getReg();
7364   Register Src = MI.getOperand(1).getReg();
7365   LLT SrcTy = MRI.getType(Src);
7366   if (MI.getFlags() & MachineInstr::NoFPExcept) {
7367     // Lower to an unordered comparison.
7368     auto Zero = MIRBuilder.buildFConstant(SrcTy, 0.0);
7369     MIRBuilder.buildFCmp(CmpInst::Predicate::FCMP_UNO, Dst, Src, Zero);
7370     MI.eraseFromParent();
7371     return Legalized;
7372   }
7373 
7374   // Use integer operations to avoid traps if the argument is SNaN.
7375 
7376   // NaN has all exp bits set and a non zero significand. Therefore:
7377   // isnan(V) == exp mask < abs(V)
7378   auto FPToSI = MIRBuilder.buildFPTOSI(SrcTy, Src);
7379   auto Mask = APInt::getSignedMaxValue(SrcTy.getScalarSizeInBits());
7380   auto MaskCst = MIRBuilder.buildConstant(SrcTy, Mask);
7381   auto AbsV = MIRBuilder.buildAnd(SrcTy, FPToSI, MaskCst);
7382   auto *FloatTy = getFloatTypeForLLT(MI.getMF()->getFunction().getContext(),
7383                                      SrcTy.getScalarType());
7384   if (!FloatTy)
7385     return UnableToLegalize;
7386   auto ExpMask = APFloat::getInf(FloatTy->getFltSemantics()).bitcastToAPInt();
7387   auto ExpMaskCst = MIRBuilder.buildConstant(SrcTy, ExpMask);
7388   MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, Dst, ExpMaskCst, AbsV);
7389   MI.eraseFromParent();
7390   return Legalized;
7391 }
7392