1 //===- InstCombineVectorOps.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements instcombine for ExtractElement, InsertElement and
10 // ShuffleVector.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "InstCombineInternal.h"
15 #include "llvm/ADT/APInt.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallBitVector.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/InstructionSimplify.h"
23 #include "llvm/Analysis/VectorUtils.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/Constant.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/InstrTypes.h"
29 #include "llvm/IR/Instruction.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/Operator.h"
32 #include "llvm/IR/PatternMatch.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/User.h"
35 #include "llvm/IR/Value.h"
36 #include "llvm/Support/Casting.h"
37 #include "llvm/Support/ErrorHandling.h"
38 #include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
39 #include "llvm/Transforms/InstCombine/InstCombiner.h"
40 #include <cassert>
41 #include <cstdint>
42 #include <iterator>
43 #include <utility>
44 
45 using namespace llvm;
46 using namespace PatternMatch;
47 
48 #define DEBUG_TYPE "instcombine"
49 
50 STATISTIC(NumAggregateReconstructionsSimplified,
51           "Number of aggregate reconstructions turned into reuse of the "
52           "original aggregate");
53 
54 /// Return true if the value is cheaper to scalarize than it is to leave as a
55 /// vector operation. IsConstantExtractIndex indicates whether we are extracting
56 /// one known element from a vector constant.
57 ///
58 /// FIXME: It's possible to create more instructions than previously existed.
59 static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
60   // If we can pick a scalar constant value out of a vector, that is free.
61   if (auto *C = dyn_cast<Constant>(V))
62     return IsConstantExtractIndex || C->getSplatValue();
63 
64   // An insertelement to the same constant index as our extract will simplify
65   // to the scalar inserted element. An insertelement to a different constant
66   // index is irrelevant to our extract.
67   if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
68     return IsConstantExtractIndex;
69 
70   if (match(V, m_OneUse(m_Load(m_Value()))))
71     return true;
72 
73   if (match(V, m_OneUse(m_UnOp())))
74     return true;
75 
76   Value *V0, *V1;
77   if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
78     if (cheapToScalarize(V0, IsConstantExtractIndex) ||
79         cheapToScalarize(V1, IsConstantExtractIndex))
80       return true;
81 
82   CmpInst::Predicate UnusedPred;
83   if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
84     if (cheapToScalarize(V0, IsConstantExtractIndex) ||
85         cheapToScalarize(V1, IsConstantExtractIndex))
86       return true;
87 
88   return false;
89 }
90 
91 // If we have a PHI node with a vector type that is only used to feed
92 // itself and be an operand of extractelement at a constant location,
93 // try to replace the PHI of the vector type with a PHI of a scalar type.
94 Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
95                                             PHINode *PN) {
96   SmallVector<Instruction *, 2> Extracts;
97   // The users we want the PHI to have are:
98   // 1) The EI ExtractElement (we already know this)
99   // 2) Possibly more ExtractElements with the same index.
100   // 3) Another operand, which will feed back into the PHI.
101   Instruction *PHIUser = nullptr;
102   for (auto U : PN->users()) {
103     if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
104       if (EI.getIndexOperand() == EU->getIndexOperand())
105         Extracts.push_back(EU);
106       else
107         return nullptr;
108     } else if (!PHIUser) {
109       PHIUser = cast<Instruction>(U);
110     } else {
111       return nullptr;
112     }
113   }
114 
115   if (!PHIUser)
116     return nullptr;
117 
118   // Verify that this PHI user has one use, which is the PHI itself,
119   // and that it is a binary operation which is cheap to scalarize.
120   // otherwise return nullptr.
121   if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
122       !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
123     return nullptr;
124 
125   // Create a scalar PHI node that will replace the vector PHI node
126   // just before the current PHI node.
127   PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
128       PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
129   // Scalarize each PHI operand.
130   for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
131     Value *PHIInVal = PN->getIncomingValue(i);
132     BasicBlock *inBB = PN->getIncomingBlock(i);
133     Value *Elt = EI.getIndexOperand();
134     // If the operand is the PHI induction variable:
135     if (PHIInVal == PHIUser) {
136       // Scalarize the binary operation. Its first operand is the
137       // scalar PHI, and the second operand is extracted from the other
138       // vector operand.
139       BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
140       unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
141       Value *Op = InsertNewInstWith(
142           ExtractElementInst::Create(B0->getOperand(opId), Elt,
143                                      B0->getOperand(opId)->getName() + ".Elt"),
144           *B0);
145       Value *newPHIUser = InsertNewInstWith(
146           BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
147                                                 scalarPHI, Op, B0), *B0);
148       scalarPHI->addIncoming(newPHIUser, inBB);
149     } else {
150       // Scalarize PHI input:
151       Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
152       // Insert the new instruction into the predecessor basic block.
153       Instruction *pos = dyn_cast<Instruction>(PHIInVal);
154       BasicBlock::iterator InsertPos;
155       if (pos && !isa<PHINode>(pos)) {
156         InsertPos = ++pos->getIterator();
157       } else {
158         InsertPos = inBB->getFirstInsertionPt();
159       }
160 
161       InsertNewInstWith(newEI, *InsertPos);
162 
163       scalarPHI->addIncoming(newEI, inBB);
164     }
165   }
166 
167   for (auto E : Extracts)
168     replaceInstUsesWith(*E, scalarPHI);
169 
170   return &EI;
171 }
172 
173 static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
174                                       InstCombiner::BuilderTy &Builder,
175                                       bool IsBigEndian) {
176   Value *X;
177   uint64_t ExtIndexC;
178   if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
179       !X->getType()->isVectorTy() ||
180       !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
181     return nullptr;
182 
183   // If this extractelement is using a bitcast from a vector of the same number
184   // of elements, see if we can find the source element from the source vector:
185   // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
186   auto *SrcTy = cast<VectorType>(X->getType());
187   Type *DestTy = Ext.getType();
188   ElementCount NumSrcElts = SrcTy->getElementCount();
189   ElementCount NumElts =
190       cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
191   if (NumSrcElts == NumElts)
192     if (Value *Elt = findScalarElement(X, ExtIndexC))
193       return new BitCastInst(Elt, DestTy);
194 
195   assert(NumSrcElts.isScalable() == NumElts.isScalable() &&
196          "Src and Dst must be the same sort of vector type");
197 
198   // If the source elements are wider than the destination, try to shift and
199   // truncate a subset of scalar bits of an insert op.
200   if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) {
201     Value *Scalar;
202     uint64_t InsIndexC;
203     if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
204                               m_ConstantInt(InsIndexC))))
205       return nullptr;
206 
207     // The extract must be from the subset of vector elements that we inserted
208     // into. Example: if we inserted element 1 of a <2 x i64> and we are
209     // extracting an i16 (narrowing ratio = 4), then this extract must be from 1
210     // of elements 4-7 of the bitcasted vector.
211     unsigned NarrowingRatio =
212         NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue();
213     if (ExtIndexC / NarrowingRatio != InsIndexC)
214       return nullptr;
215 
216     // We are extracting part of the original scalar. How that scalar is
217     // inserted into the vector depends on the endian-ness. Example:
218     //              Vector Byte Elt Index:    0  1  2  3  4  5  6  7
219     //                                       +--+--+--+--+--+--+--+--+
220     // inselt <2 x i32> V, <i32> S, 1:       |V0|V1|V2|V3|S0|S1|S2|S3|
221     // extelt <4 x i16> V', 3:               |                 |S2|S3|
222     //                                       +--+--+--+--+--+--+--+--+
223     // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
224     // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
225     // In this example, we must right-shift little-endian. Big-endian is just a
226     // truncate.
227     unsigned Chunk = ExtIndexC % NarrowingRatio;
228     if (IsBigEndian)
229       Chunk = NarrowingRatio - 1 - Chunk;
230 
231     // Bail out if this is an FP vector to FP vector sequence. That would take
232     // more instructions than we started with unless there is no shift, and it
233     // may not be handled as well in the backend.
234     bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
235     bool NeedDestBitcast = DestTy->isFloatingPointTy();
236     if (NeedSrcBitcast && NeedDestBitcast)
237       return nullptr;
238 
239     unsigned SrcWidth = SrcTy->getScalarSizeInBits();
240     unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
241     unsigned ShAmt = Chunk * DestWidth;
242 
243     // TODO: This limitation is more strict than necessary. We could sum the
244     // number of new instructions and subtract the number eliminated to know if
245     // we can proceed.
246     if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
247       if (NeedSrcBitcast || NeedDestBitcast)
248         return nullptr;
249 
250     if (NeedSrcBitcast) {
251       Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
252       Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
253     }
254 
255     if (ShAmt) {
256       // Bail out if we could end with more instructions than we started with.
257       if (!Ext.getVectorOperand()->hasOneUse())
258         return nullptr;
259       Scalar = Builder.CreateLShr(Scalar, ShAmt);
260     }
261 
262     if (NeedDestBitcast) {
263       Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
264       return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
265     }
266     return new TruncInst(Scalar, DestTy);
267   }
268 
269   return nullptr;
270 }
271 
272 /// Find elements of V demanded by UserInstr.
273 static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
274   unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
275 
276   // Conservatively assume that all elements are needed.
277   APInt UsedElts(APInt::getAllOnesValue(VWidth));
278 
279   switch (UserInstr->getOpcode()) {
280   case Instruction::ExtractElement: {
281     ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
282     assert(EEI->getVectorOperand() == V);
283     ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
284     if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
285       UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
286     }
287     break;
288   }
289   case Instruction::ShuffleVector: {
290     ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
291     unsigned MaskNumElts =
292         cast<FixedVectorType>(UserInstr->getType())->getNumElements();
293 
294     UsedElts = APInt(VWidth, 0);
295     for (unsigned i = 0; i < MaskNumElts; i++) {
296       unsigned MaskVal = Shuffle->getMaskValue(i);
297       if (MaskVal == -1u || MaskVal >= 2 * VWidth)
298         continue;
299       if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
300         UsedElts.setBit(MaskVal);
301       if (Shuffle->getOperand(1) == V &&
302           ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
303         UsedElts.setBit(MaskVal - VWidth);
304     }
305     break;
306   }
307   default:
308     break;
309   }
310   return UsedElts;
311 }
312 
313 /// Find union of elements of V demanded by all its users.
314 /// If it is known by querying findDemandedEltsBySingleUser that
315 /// no user demands an element of V, then the corresponding bit
316 /// remains unset in the returned value.
317 static APInt findDemandedEltsByAllUsers(Value *V) {
318   unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
319 
320   APInt UnionUsedElts(VWidth, 0);
321   for (const Use &U : V->uses()) {
322     if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
323       UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
324     } else {
325       UnionUsedElts = APInt::getAllOnesValue(VWidth);
326       break;
327     }
328 
329     if (UnionUsedElts.isAllOnesValue())
330       break;
331   }
332 
333   return UnionUsedElts;
334 }
335 
336 Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
337   Value *SrcVec = EI.getVectorOperand();
338   Value *Index = EI.getIndexOperand();
339   if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
340                                             SQ.getWithInstruction(&EI)))
341     return replaceInstUsesWith(EI, V);
342 
343   // If extracting a specified index from the vector, see if we can recursively
344   // find a previously computed scalar that was inserted into the vector.
345   auto *IndexC = dyn_cast<ConstantInt>(Index);
346   if (IndexC) {
347     ElementCount EC = EI.getVectorOperandType()->getElementCount();
348     unsigned NumElts = EC.getKnownMinValue();
349 
350     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(SrcVec)) {
351       Intrinsic::ID IID = II->getIntrinsicID();
352       // Index needs to be lower than the minimum size of the vector, because
353       // for scalable vector, the vector size is known at run time.
354       if (IID == Intrinsic::experimental_stepvector &&
355           IndexC->getValue().ult(NumElts)) {
356         Type *Ty = EI.getType();
357         unsigned BitWidth = Ty->getIntegerBitWidth();
358         Value *Idx;
359         // Return index when its value does not exceed the allowed limit
360         // for the element type of the vector, otherwise return undefined.
361         if (IndexC->getValue().getActiveBits() <= BitWidth)
362           Idx = ConstantInt::get(Ty, IndexC->getValue().zextOrTrunc(BitWidth));
363         else
364           Idx = UndefValue::get(Ty);
365         return replaceInstUsesWith(EI, Idx);
366       }
367     }
368     // InstSimplify should handle cases where the index is invalid.
369     // For fixed-length vector, it's invalid to extract out-of-range element.
370     if (!EC.isScalable() && IndexC->getValue().uge(NumElts))
371       return nullptr;
372 
373     // This instruction only demands the single element from the input vector.
374     // Skip for scalable type, the number of elements is unknown at
375     // compile-time.
376     if (!EC.isScalable() && NumElts != 1) {
377       // If the input vector has a single use, simplify it based on this use
378       // property.
379       if (SrcVec->hasOneUse()) {
380         APInt UndefElts(NumElts, 0);
381         APInt DemandedElts(NumElts, 0);
382         DemandedElts.setBit(IndexC->getZExtValue());
383         if (Value *V =
384                 SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
385           return replaceOperand(EI, 0, V);
386       } else {
387         // If the input vector has multiple uses, simplify it based on a union
388         // of all elements used.
389         APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
390         if (!DemandedElts.isAllOnesValue()) {
391           APInt UndefElts(NumElts, 0);
392           if (Value *V = SimplifyDemandedVectorElts(
393                   SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
394                   true /* AllowMultipleUsers */)) {
395             if (V != SrcVec) {
396               SrcVec->replaceAllUsesWith(V);
397               return &EI;
398             }
399           }
400         }
401       }
402     }
403     if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
404       return I;
405 
406     // If there's a vector PHI feeding a scalar use through this extractelement
407     // instruction, try to scalarize the PHI.
408     if (auto *Phi = dyn_cast<PHINode>(SrcVec))
409       if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
410         return ScalarPHI;
411   }
412 
413   // TODO come up with a n-ary matcher that subsumes both unary and
414   // binary matchers.
415   UnaryOperator *UO;
416   if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
417     // extelt (unop X), Index --> unop (extelt X, Index)
418     Value *X = UO->getOperand(0);
419     Value *E = Builder.CreateExtractElement(X, Index);
420     return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
421   }
422 
423   BinaryOperator *BO;
424   if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
425     // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
426     Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
427     Value *E0 = Builder.CreateExtractElement(X, Index);
428     Value *E1 = Builder.CreateExtractElement(Y, Index);
429     return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
430   }
431 
432   Value *X, *Y;
433   CmpInst::Predicate Pred;
434   if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
435       cheapToScalarize(SrcVec, IndexC)) {
436     // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
437     Value *E0 = Builder.CreateExtractElement(X, Index);
438     Value *E1 = Builder.CreateExtractElement(Y, Index);
439     return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
440   }
441 
442   if (auto *I = dyn_cast<Instruction>(SrcVec)) {
443     if (auto *IE = dyn_cast<InsertElementInst>(I)) {
444       // Extracting the inserted element?
445       if (IE->getOperand(2) == Index)
446         return replaceInstUsesWith(EI, IE->getOperand(1));
447       // If the inserted and extracted elements are constants, they must not
448       // be the same value, extract from the pre-inserted value instead.
449       if (isa<Constant>(IE->getOperand(2)) && IndexC)
450         return replaceOperand(EI, 0, IE->getOperand(0));
451     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
452       auto *VecType = cast<VectorType>(GEP->getType());
453       ElementCount EC = VecType->getElementCount();
454       uint64_t IdxVal = IndexC ? IndexC->getZExtValue() : 0;
455       if (IndexC && IdxVal < EC.getKnownMinValue() && GEP->hasOneUse()) {
456         // Find out why we have a vector result - these are a few examples:
457         //  1. We have a scalar pointer and a vector of indices, or
458         //  2. We have a vector of pointers and a scalar index, or
459         //  3. We have a vector of pointers and a vector of indices, etc.
460         // Here we only consider combining when there is exactly one vector
461         // operand, since the optimization is less obviously a win due to
462         // needing more than one extractelements.
463 
464         unsigned VectorOps =
465             llvm::count_if(GEP->operands(), [](const Value *V) {
466               return isa<VectorType>(V->getType());
467             });
468         if (VectorOps > 1)
469           return nullptr;
470         assert(VectorOps == 1 && "Expected exactly one vector GEP operand!");
471 
472         Value *NewPtr = GEP->getPointerOperand();
473         if (isa<VectorType>(NewPtr->getType()))
474           NewPtr = Builder.CreateExtractElement(NewPtr, IndexC);
475 
476         SmallVector<Value *> NewOps;
477         for (unsigned I = 1; I != GEP->getNumOperands(); ++I) {
478           Value *Op = GEP->getOperand(I);
479           if (isa<VectorType>(Op->getType()))
480             NewOps.push_back(Builder.CreateExtractElement(Op, IndexC));
481           else
482             NewOps.push_back(Op);
483         }
484 
485         GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
486             cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr,
487             NewOps);
488         NewGEP->setIsInBounds(GEP->isInBounds());
489         return NewGEP;
490       }
491       return nullptr;
492     } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
493       // If this is extracting an element from a shufflevector, figure out where
494       // it came from and extract from the appropriate input element instead.
495       // Restrict the following transformation to fixed-length vector.
496       if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
497         int SrcIdx =
498             SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
499         Value *Src;
500         unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
501                                 ->getNumElements();
502 
503         if (SrcIdx < 0)
504           return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
505         if (SrcIdx < (int)LHSWidth)
506           Src = SVI->getOperand(0);
507         else {
508           SrcIdx -= LHSWidth;
509           Src = SVI->getOperand(1);
510         }
511         Type *Int32Ty = Type::getInt32Ty(EI.getContext());
512         return ExtractElementInst::Create(
513             Src, ConstantInt::get(Int32Ty, SrcIdx, false));
514       }
515     } else if (auto *CI = dyn_cast<CastInst>(I)) {
516       // Canonicalize extractelement(cast) -> cast(extractelement).
517       // Bitcasts can change the number of vector elements, and they cost
518       // nothing.
519       if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
520         Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
521         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
522       }
523     }
524   }
525   return nullptr;
526 }
527 
528 /// If V is a shuffle of values that ONLY returns elements from either LHS or
529 /// RHS, return the shuffle mask and true. Otherwise, return false.
530 static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
531                                          SmallVectorImpl<int> &Mask) {
532   assert(LHS->getType() == RHS->getType() &&
533          "Invalid CollectSingleShuffleElements");
534   unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
535 
536   if (match(V, m_Undef())) {
537     Mask.assign(NumElts, -1);
538     return true;
539   }
540 
541   if (V == LHS) {
542     for (unsigned i = 0; i != NumElts; ++i)
543       Mask.push_back(i);
544     return true;
545   }
546 
547   if (V == RHS) {
548     for (unsigned i = 0; i != NumElts; ++i)
549       Mask.push_back(i + NumElts);
550     return true;
551   }
552 
553   if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
554     // If this is an insert of an extract from some other vector, include it.
555     Value *VecOp    = IEI->getOperand(0);
556     Value *ScalarOp = IEI->getOperand(1);
557     Value *IdxOp    = IEI->getOperand(2);
558 
559     if (!isa<ConstantInt>(IdxOp))
560       return false;
561     unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
562 
563     if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
564       // We can handle this if the vector we are inserting into is
565       // transitively ok.
566       if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
567         // If so, update the mask to reflect the inserted undef.
568         Mask[InsertedIdx] = -1;
569         return true;
570       }
571     } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
572       if (isa<ConstantInt>(EI->getOperand(1))) {
573         unsigned ExtractedIdx =
574         cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
575         unsigned NumLHSElts =
576             cast<FixedVectorType>(LHS->getType())->getNumElements();
577 
578         // This must be extracting from either LHS or RHS.
579         if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
580           // We can handle this if the vector we are inserting into is
581           // transitively ok.
582           if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
583             // If so, update the mask to reflect the inserted value.
584             if (EI->getOperand(0) == LHS) {
585               Mask[InsertedIdx % NumElts] = ExtractedIdx;
586             } else {
587               assert(EI->getOperand(0) == RHS);
588               Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
589             }
590             return true;
591           }
592         }
593       }
594     }
595   }
596 
597   return false;
598 }
599 
600 /// If we have insertion into a vector that is wider than the vector that we
601 /// are extracting from, try to widen the source vector to allow a single
602 /// shufflevector to replace one or more insert/extract pairs.
603 static void replaceExtractElements(InsertElementInst *InsElt,
604                                    ExtractElementInst *ExtElt,
605                                    InstCombinerImpl &IC) {
606   auto *InsVecType = cast<FixedVectorType>(InsElt->getType());
607   auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType());
608   unsigned NumInsElts = InsVecType->getNumElements();
609   unsigned NumExtElts = ExtVecType->getNumElements();
610 
611   // The inserted-to vector must be wider than the extracted-from vector.
612   if (InsVecType->getElementType() != ExtVecType->getElementType() ||
613       NumExtElts >= NumInsElts)
614     return;
615 
616   // Create a shuffle mask to widen the extended-from vector using poison
617   // values. The mask selects all of the values of the original vector followed
618   // by as many poison values as needed to create a vector of the same length
619   // as the inserted-to vector.
620   SmallVector<int, 16> ExtendMask;
621   for (unsigned i = 0; i < NumExtElts; ++i)
622     ExtendMask.push_back(i);
623   for (unsigned i = NumExtElts; i < NumInsElts; ++i)
624     ExtendMask.push_back(-1);
625 
626   Value *ExtVecOp = ExtElt->getVectorOperand();
627   auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
628   BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
629                                    ? ExtVecOpInst->getParent()
630                                    : ExtElt->getParent();
631 
632   // TODO: This restriction matches the basic block check below when creating
633   // new extractelement instructions. If that limitation is removed, this one
634   // could also be removed. But for now, we just bail out to ensure that we
635   // will replace the extractelement instruction that is feeding our
636   // insertelement instruction. This allows the insertelement to then be
637   // replaced by a shufflevector. If the insertelement is not replaced, we can
638   // induce infinite looping because there's an optimization for extractelement
639   // that will delete our widening shuffle. This would trigger another attempt
640   // here to create that shuffle, and we spin forever.
641   if (InsertionBlock != InsElt->getParent())
642     return;
643 
644   // TODO: This restriction matches the check in visitInsertElementInst() and
645   // prevents an infinite loop caused by not turning the extract/insert pair
646   // into a shuffle. We really should not need either check, but we're lacking
647   // folds for shufflevectors because we're afraid to generate shuffle masks
648   // that the backend can't handle.
649   if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
650     return;
651 
652   auto *WideVec =
653       new ShuffleVectorInst(ExtVecOp, PoisonValue::get(ExtVecType), ExtendMask);
654 
655   // Insert the new shuffle after the vector operand of the extract is defined
656   // (as long as it's not a PHI) or at the start of the basic block of the
657   // extract, so any subsequent extracts in the same basic block can use it.
658   // TODO: Insert before the earliest ExtractElementInst that is replaced.
659   if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
660     WideVec->insertAfter(ExtVecOpInst);
661   else
662     IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
663 
664   // Replace extracts from the original narrow vector with extracts from the new
665   // wide vector.
666   for (User *U : ExtVecOp->users()) {
667     ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
668     if (!OldExt || OldExt->getParent() != WideVec->getParent())
669       continue;
670     auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
671     NewExt->insertAfter(OldExt);
672     IC.replaceInstUsesWith(*OldExt, NewExt);
673   }
674 }
675 
676 /// We are building a shuffle to create V, which is a sequence of insertelement,
677 /// extractelement pairs. If PermittedRHS is set, then we must either use it or
678 /// not rely on the second vector source. Return a std::pair containing the
679 /// left and right vectors of the proposed shuffle (or 0), and set the Mask
680 /// parameter as required.
681 ///
682 /// Note: we intentionally don't try to fold earlier shuffles since they have
683 /// often been chosen carefully to be efficiently implementable on the target.
684 using ShuffleOps = std::pair<Value *, Value *>;
685 
686 static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
687                                          Value *PermittedRHS,
688                                          InstCombinerImpl &IC) {
689   assert(V->getType()->isVectorTy() && "Invalid shuffle!");
690   unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
691 
692   if (match(V, m_Undef())) {
693     Mask.assign(NumElts, -1);
694     return std::make_pair(
695         PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
696   }
697 
698   if (isa<ConstantAggregateZero>(V)) {
699     Mask.assign(NumElts, 0);
700     return std::make_pair(V, nullptr);
701   }
702 
703   if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
704     // If this is an insert of an extract from some other vector, include it.
705     Value *VecOp    = IEI->getOperand(0);
706     Value *ScalarOp = IEI->getOperand(1);
707     Value *IdxOp    = IEI->getOperand(2);
708 
709     if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
710       if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
711         unsigned ExtractedIdx =
712           cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
713         unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
714 
715         // Either the extracted from or inserted into vector must be RHSVec,
716         // otherwise we'd end up with a shuffle of three inputs.
717         if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
718           Value *RHS = EI->getOperand(0);
719           ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
720           assert(LR.second == nullptr || LR.second == RHS);
721 
722           if (LR.first->getType() != RHS->getType()) {
723             // Although we are giving up for now, see if we can create extracts
724             // that match the inserts for another round of combining.
725             replaceExtractElements(IEI, EI, IC);
726 
727             // We tried our best, but we can't find anything compatible with RHS
728             // further up the chain. Return a trivial shuffle.
729             for (unsigned i = 0; i < NumElts; ++i)
730               Mask[i] = i;
731             return std::make_pair(V, nullptr);
732           }
733 
734           unsigned NumLHSElts =
735               cast<FixedVectorType>(RHS->getType())->getNumElements();
736           Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
737           return std::make_pair(LR.first, RHS);
738         }
739 
740         if (VecOp == PermittedRHS) {
741           // We've gone as far as we can: anything on the other side of the
742           // extractelement will already have been converted into a shuffle.
743           unsigned NumLHSElts =
744               cast<FixedVectorType>(EI->getOperand(0)->getType())
745                   ->getNumElements();
746           for (unsigned i = 0; i != NumElts; ++i)
747             Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
748           return std::make_pair(EI->getOperand(0), PermittedRHS);
749         }
750 
751         // If this insertelement is a chain that comes from exactly these two
752         // vectors, return the vector and the effective shuffle.
753         if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
754             collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
755                                          Mask))
756           return std::make_pair(EI->getOperand(0), PermittedRHS);
757       }
758     }
759   }
760 
761   // Otherwise, we can't do anything fancy. Return an identity vector.
762   for (unsigned i = 0; i != NumElts; ++i)
763     Mask.push_back(i);
764   return std::make_pair(V, nullptr);
765 }
766 
767 /// Look for chain of insertvalue's that fully define an aggregate, and trace
768 /// back the values inserted, see if they are all were extractvalue'd from
769 /// the same source aggregate from the exact same element indexes.
770 /// If they were, just reuse the source aggregate.
771 /// This potentially deals with PHI indirections.
772 Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
773     InsertValueInst &OrigIVI) {
774   Type *AggTy = OrigIVI.getType();
775   unsigned NumAggElts;
776   switch (AggTy->getTypeID()) {
777   case Type::StructTyID:
778     NumAggElts = AggTy->getStructNumElements();
779     break;
780   case Type::ArrayTyID:
781     NumAggElts = AggTy->getArrayNumElements();
782     break;
783   default:
784     llvm_unreachable("Unhandled aggregate type?");
785   }
786 
787   // Arbitrary aggregate size cut-off. Motivation for limit of 2 is to be able
788   // to handle clang C++ exception struct (which is hardcoded as {i8*, i32}),
789   // FIXME: any interesting patterns to be caught with larger limit?
790   assert(NumAggElts > 0 && "Aggregate should have elements.");
791   if (NumAggElts > 2)
792     return nullptr;
793 
794   static constexpr auto NotFound = None;
795   static constexpr auto FoundMismatch = nullptr;
796 
797   // Try to find a value of each element of an aggregate.
798   // FIXME: deal with more complex, not one-dimensional, aggregate types
799   SmallVector<Optional<Instruction *>, 2> AggElts(NumAggElts, NotFound);
800 
801   // Do we know values for each element of the aggregate?
802   auto KnowAllElts = [&AggElts]() {
803     return all_of(AggElts,
804                   [](Optional<Instruction *> Elt) { return Elt != NotFound; });
805   };
806 
807   int Depth = 0;
808 
809   // Arbitrary `insertvalue` visitation depth limit. Let's be okay with
810   // every element being overwritten twice, which should never happen.
811   static const int DepthLimit = 2 * NumAggElts;
812 
813   // Recurse up the chain of `insertvalue` aggregate operands until either we've
814   // reconstructed full initializer or can't visit any more `insertvalue`'s.
815   for (InsertValueInst *CurrIVI = &OrigIVI;
816        Depth < DepthLimit && CurrIVI && !KnowAllElts();
817        CurrIVI = dyn_cast<InsertValueInst>(CurrIVI->getAggregateOperand()),
818                        ++Depth) {
819     auto *InsertedValue =
820         dyn_cast<Instruction>(CurrIVI->getInsertedValueOperand());
821     if (!InsertedValue)
822       return nullptr; // Inserted value must be produced by an instruction.
823 
824     ArrayRef<unsigned int> Indices = CurrIVI->getIndices();
825 
826     // Don't bother with more than single-level aggregates.
827     if (Indices.size() != 1)
828       return nullptr; // FIXME: deal with more complex aggregates?
829 
830     // Now, we may have already previously recorded the value for this element
831     // of an aggregate. If we did, that means the CurrIVI will later be
832     // overwritten with the already-recorded value. But if not, let's record it!
833     Optional<Instruction *> &Elt = AggElts[Indices.front()];
834     Elt = Elt.getValueOr(InsertedValue);
835 
836     // FIXME: should we handle chain-terminating undef base operand?
837   }
838 
839   // Was that sufficient to deduce the full initializer for the aggregate?
840   if (!KnowAllElts())
841     return nullptr; // Give up then.
842 
843   // We now want to find the source[s] of the aggregate elements we've found.
844   // And with "source" we mean the original aggregate[s] from which
845   // the inserted elements were extracted. This may require PHI translation.
846 
847   enum class AggregateDescription {
848     /// When analyzing the value that was inserted into an aggregate, we did
849     /// not manage to find defining `extractvalue` instruction to analyze.
850     NotFound,
851     /// When analyzing the value that was inserted into an aggregate, we did
852     /// manage to find defining `extractvalue` instruction[s], and everything
853     /// matched perfectly - aggregate type, element insertion/extraction index.
854     Found,
855     /// When analyzing the value that was inserted into an aggregate, we did
856     /// manage to find defining `extractvalue` instruction, but there was
857     /// a mismatch: either the source type from which the extraction was didn't
858     /// match the aggregate type into which the insertion was,
859     /// or the extraction/insertion channels mismatched,
860     /// or different elements had different source aggregates.
861     FoundMismatch
862   };
863   auto Describe = [](Optional<Value *> SourceAggregate) {
864     if (SourceAggregate == NotFound)
865       return AggregateDescription::NotFound;
866     if (*SourceAggregate == FoundMismatch)
867       return AggregateDescription::FoundMismatch;
868     return AggregateDescription::Found;
869   };
870 
871   // Given the value \p Elt that was being inserted into element \p EltIdx of an
872   // aggregate AggTy, see if \p Elt was originally defined by an
873   // appropriate extractvalue (same element index, same aggregate type).
874   // If found, return the source aggregate from which the extraction was.
875   // If \p PredBB is provided, does PHI translation of an \p Elt first.
876   auto FindSourceAggregate =
877       [&](Instruction *Elt, unsigned EltIdx, Optional<BasicBlock *> UseBB,
878           Optional<BasicBlock *> PredBB) -> Optional<Value *> {
879     // For now(?), only deal with, at most, a single level of PHI indirection.
880     if (UseBB && PredBB)
881       Elt = dyn_cast<Instruction>(Elt->DoPHITranslation(*UseBB, *PredBB));
882     // FIXME: deal with multiple levels of PHI indirection?
883 
884     // Did we find an extraction?
885     auto *EVI = dyn_cast_or_null<ExtractValueInst>(Elt);
886     if (!EVI)
887       return NotFound;
888 
889     Value *SourceAggregate = EVI->getAggregateOperand();
890 
891     // Is the extraction from the same type into which the insertion was?
892     if (SourceAggregate->getType() != AggTy)
893       return FoundMismatch;
894     // And the element index doesn't change between extraction and insertion?
895     if (EVI->getNumIndices() != 1 || EltIdx != EVI->getIndices().front())
896       return FoundMismatch;
897 
898     return SourceAggregate; // AggregateDescription::Found
899   };
900 
901   // Given elements AggElts that were constructing an aggregate OrigIVI,
902   // see if we can find appropriate source aggregate for each of the elements,
903   // and see it's the same aggregate for each element. If so, return it.
904   auto FindCommonSourceAggregate =
905       [&](Optional<BasicBlock *> UseBB,
906           Optional<BasicBlock *> PredBB) -> Optional<Value *> {
907     Optional<Value *> SourceAggregate;
908 
909     for (auto I : enumerate(AggElts)) {
910       assert(Describe(SourceAggregate) != AggregateDescription::FoundMismatch &&
911              "We don't store nullptr in SourceAggregate!");
912       assert((Describe(SourceAggregate) == AggregateDescription::Found) ==
913                  (I.index() != 0) &&
914              "SourceAggregate should be valid after the the first element,");
915 
916       // For this element, is there a plausible source aggregate?
917       // FIXME: we could special-case undef element, IFF we know that in the
918       //        source aggregate said element isn't poison.
919       Optional<Value *> SourceAggregateForElement =
920           FindSourceAggregate(*I.value(), I.index(), UseBB, PredBB);
921 
922       // Okay, what have we found? Does that correlate with previous findings?
923 
924       // Regardless of whether or not we have previously found source
925       // aggregate for previous elements (if any), if we didn't find one for
926       // this element, passthrough whatever we have just found.
927       if (Describe(SourceAggregateForElement) != AggregateDescription::Found)
928         return SourceAggregateForElement;
929 
930       // Okay, we have found source aggregate for this element.
931       // Let's see what we already know from previous elements, if any.
932       switch (Describe(SourceAggregate)) {
933       case AggregateDescription::NotFound:
934         // This is apparently the first element that we have examined.
935         SourceAggregate = SourceAggregateForElement; // Record the aggregate!
936         continue; // Great, now look at next element.
937       case AggregateDescription::Found:
938         // We have previously already successfully examined other elements.
939         // Is this the same source aggregate we've found for other elements?
940         if (*SourceAggregateForElement != *SourceAggregate)
941           return FoundMismatch;
942         continue; // Still the same aggregate, look at next element.
943       case AggregateDescription::FoundMismatch:
944         llvm_unreachable("Can't happen. We would have early-exited then.");
945       };
946     }
947 
948     assert(Describe(SourceAggregate) == AggregateDescription::Found &&
949            "Must be a valid Value");
950     return *SourceAggregate;
951   };
952 
953   Optional<Value *> SourceAggregate;
954 
955   // Can we find the source aggregate without looking at predecessors?
956   SourceAggregate = FindCommonSourceAggregate(/*UseBB=*/None, /*PredBB=*/None);
957   if (Describe(SourceAggregate) != AggregateDescription::NotFound) {
958     if (Describe(SourceAggregate) == AggregateDescription::FoundMismatch)
959       return nullptr; // Conflicting source aggregates!
960     ++NumAggregateReconstructionsSimplified;
961     return replaceInstUsesWith(OrigIVI, *SourceAggregate);
962   }
963 
964   // Okay, apparently we need to look at predecessors.
965 
966   // We should be smart about picking the "use" basic block, which will be the
967   // merge point for aggregate, where we'll insert the final PHI that will be
968   // used instead of OrigIVI. Basic block of OrigIVI is *not* the right choice.
969   // We should look in which blocks each of the AggElts is being defined,
970   // they all should be defined in the same basic block.
971   BasicBlock *UseBB = nullptr;
972 
973   for (const Optional<Instruction *> &I : AggElts) {
974     BasicBlock *BB = (*I)->getParent();
975     // If it's the first instruction we've encountered, record the basic block.
976     if (!UseBB) {
977       UseBB = BB;
978       continue;
979     }
980     // Otherwise, this must be the same basic block we've seen previously.
981     if (UseBB != BB)
982       return nullptr;
983   }
984 
985   // If *all* of the elements are basic-block-independent, meaning they are
986   // either function arguments, or constant expressions, then if we didn't
987   // handle them without predecessor-aware handling, we won't handle them now.
988   if (!UseBB)
989     return nullptr;
990 
991   // If we didn't manage to find source aggregate without looking at
992   // predecessors, and there are no predecessors to look at, then we're done.
993   if (pred_empty(UseBB))
994     return nullptr;
995 
996   // Arbitrary predecessor count limit.
997   static const int PredCountLimit = 64;
998 
999   // Cache the (non-uniqified!) list of predecessors in a vector,
1000   // checking the limit at the same time for efficiency.
1001   SmallVector<BasicBlock *, 4> Preds; // May have duplicates!
1002   for (BasicBlock *Pred : predecessors(UseBB)) {
1003     // Don't bother if there are too many predecessors.
1004     if (Preds.size() >= PredCountLimit) // FIXME: only count duplicates once?
1005       return nullptr;
1006     Preds.emplace_back(Pred);
1007   }
1008 
1009   // For each predecessor, what is the source aggregate,
1010   // from which all the elements were originally extracted from?
1011   // Note that we want for the map to have stable iteration order!
1012   SmallDenseMap<BasicBlock *, Value *, 4> SourceAggregates;
1013   for (BasicBlock *Pred : Preds) {
1014     std::pair<decltype(SourceAggregates)::iterator, bool> IV =
1015         SourceAggregates.insert({Pred, nullptr});
1016     // Did we already evaluate this predecessor?
1017     if (!IV.second)
1018       continue;
1019 
1020     // Let's hope that when coming from predecessor Pred, all elements of the
1021     // aggregate produced by OrigIVI must have been originally extracted from
1022     // the same aggregate. Is that so? Can we find said original aggregate?
1023     SourceAggregate = FindCommonSourceAggregate(UseBB, Pred);
1024     if (Describe(SourceAggregate) != AggregateDescription::Found)
1025       return nullptr; // Give up.
1026     IV.first->second = *SourceAggregate;
1027   }
1028 
1029   // All good! Now we just need to thread the source aggregates here.
1030   // Note that we have to insert the new PHI here, ourselves, because we can't
1031   // rely on InstCombinerImpl::run() inserting it into the right basic block.
1032   // Note that the same block can be a predecessor more than once,
1033   // and we need to preserve that invariant for the PHI node.
1034   BuilderTy::InsertPointGuard Guard(Builder);
1035   Builder.SetInsertPoint(UseBB->getFirstNonPHI());
1036   auto *PHI =
1037       Builder.CreatePHI(AggTy, Preds.size(), OrigIVI.getName() + ".merged");
1038   for (BasicBlock *Pred : Preds)
1039     PHI->addIncoming(SourceAggregates[Pred], Pred);
1040 
1041   ++NumAggregateReconstructionsSimplified;
1042   return replaceInstUsesWith(OrigIVI, PHI);
1043 }
1044 
1045 /// Try to find redundant insertvalue instructions, like the following ones:
1046 ///  %0 = insertvalue { i8, i32 } undef, i8 %x, 0
1047 ///  %1 = insertvalue { i8, i32 } %0,    i8 %y, 0
1048 /// Here the second instruction inserts values at the same indices, as the
1049 /// first one, making the first one redundant.
1050 /// It should be transformed to:
1051 ///  %0 = insertvalue { i8, i32 } undef, i8 %y, 0
1052 Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {
1053   bool IsRedundant = false;
1054   ArrayRef<unsigned int> FirstIndices = I.getIndices();
1055 
1056   // If there is a chain of insertvalue instructions (each of them except the
1057   // last one has only one use and it's another insertvalue insn from this
1058   // chain), check if any of the 'children' uses the same indices as the first
1059   // instruction. In this case, the first one is redundant.
1060   Value *V = &I;
1061   unsigned Depth = 0;
1062   while (V->hasOneUse() && Depth < 10) {
1063     User *U = V->user_back();
1064     auto UserInsInst = dyn_cast<InsertValueInst>(U);
1065     if (!UserInsInst || U->getOperand(0) != V)
1066       break;
1067     if (UserInsInst->getIndices() == FirstIndices) {
1068       IsRedundant = true;
1069       break;
1070     }
1071     V = UserInsInst;
1072     Depth++;
1073   }
1074 
1075   if (IsRedundant)
1076     return replaceInstUsesWith(I, I.getOperand(0));
1077 
1078   if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
1079     return NewI;
1080 
1081   return nullptr;
1082 }
1083 
1084 static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
1085   // Can not analyze scalable type, the number of elements is not a compile-time
1086   // constant.
1087   if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
1088     return false;
1089 
1090   int MaskSize = Shuf.getShuffleMask().size();
1091   int VecSize =
1092       cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
1093 
1094   // A vector select does not change the size of the operands.
1095   if (MaskSize != VecSize)
1096     return false;
1097 
1098   // Each mask element must be undefined or choose a vector element from one of
1099   // the source operands without crossing vector lanes.
1100   for (int i = 0; i != MaskSize; ++i) {
1101     int Elt = Shuf.getMaskValue(i);
1102     if (Elt != -1 && Elt != i && Elt != i + VecSize)
1103       return false;
1104   }
1105 
1106   return true;
1107 }
1108 
1109 /// Turn a chain of inserts that splats a value into an insert + shuffle:
1110 /// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
1111 /// shufflevector(insertelt(X, %k, 0), poison, zero)
1112 static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
1113   // We are interested in the last insert in a chain. So if this insert has a
1114   // single user and that user is an insert, bail.
1115   if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
1116     return nullptr;
1117 
1118   VectorType *VecTy = InsElt.getType();
1119   // Can not handle scalable type, the number of elements is not a compile-time
1120   // constant.
1121   if (isa<ScalableVectorType>(VecTy))
1122     return nullptr;
1123   unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
1124 
1125   // Do not try to do this for a one-element vector, since that's a nop,
1126   // and will cause an inf-loop.
1127   if (NumElements == 1)
1128     return nullptr;
1129 
1130   Value *SplatVal = InsElt.getOperand(1);
1131   InsertElementInst *CurrIE = &InsElt;
1132   SmallBitVector ElementPresent(NumElements, false);
1133   InsertElementInst *FirstIE = nullptr;
1134 
1135   // Walk the chain backwards, keeping track of which indices we inserted into,
1136   // until we hit something that isn't an insert of the splatted value.
1137   while (CurrIE) {
1138     auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
1139     if (!Idx || CurrIE->getOperand(1) != SplatVal)
1140       return nullptr;
1141 
1142     auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
1143     // Check none of the intermediate steps have any additional uses, except
1144     // for the root insertelement instruction, which can be re-used, if it
1145     // inserts at position 0.
1146     if (CurrIE != &InsElt &&
1147         (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
1148       return nullptr;
1149 
1150     ElementPresent[Idx->getZExtValue()] = true;
1151     FirstIE = CurrIE;
1152     CurrIE = NextIE;
1153   }
1154 
1155   // If this is just a single insertelement (not a sequence), we are done.
1156   if (FirstIE == &InsElt)
1157     return nullptr;
1158 
1159   // If we are not inserting into an undef vector, make sure we've seen an
1160   // insert into every element.
1161   // TODO: If the base vector is not undef, it might be better to create a splat
1162   //       and then a select-shuffle (blend) with the base vector.
1163   if (!match(FirstIE->getOperand(0), m_Undef()))
1164     if (!ElementPresent.all())
1165       return nullptr;
1166 
1167   // Create the insert + shuffle.
1168   Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
1169   PoisonValue *PoisonVec = PoisonValue::get(VecTy);
1170   Constant *Zero = ConstantInt::get(Int32Ty, 0);
1171   if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
1172     FirstIE = InsertElementInst::Create(PoisonVec, SplatVal, Zero, "", &InsElt);
1173 
1174   // Splat from element 0, but replace absent elements with undef in the mask.
1175   SmallVector<int, 16> Mask(NumElements, 0);
1176   for (unsigned i = 0; i != NumElements; ++i)
1177     if (!ElementPresent[i])
1178       Mask[i] = -1;
1179 
1180   return new ShuffleVectorInst(FirstIE, PoisonVec, Mask);
1181 }
1182 
1183 /// Try to fold an insert element into an existing splat shuffle by changing
1184 /// the shuffle's mask to include the index of this insert element.
1185 static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
1186   // Check if the vector operand of this insert is a canonical splat shuffle.
1187   auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
1188   if (!Shuf || !Shuf->isZeroEltSplat())
1189     return nullptr;
1190 
1191   // Bail out early if shuffle is scalable type. The number of elements in
1192   // shuffle mask is unknown at compile-time.
1193   if (isa<ScalableVectorType>(Shuf->getType()))
1194     return nullptr;
1195 
1196   // Check for a constant insertion index.
1197   uint64_t IdxC;
1198   if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
1199     return nullptr;
1200 
1201   // Check if the splat shuffle's input is the same as this insert's scalar op.
1202   Value *X = InsElt.getOperand(1);
1203   Value *Op0 = Shuf->getOperand(0);
1204   if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
1205     return nullptr;
1206 
1207   // Replace the shuffle mask element at the index of this insert with a zero.
1208   // For example:
1209   // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
1210   //   --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
1211   unsigned NumMaskElts =
1212       cast<FixedVectorType>(Shuf->getType())->getNumElements();
1213   SmallVector<int, 16> NewMask(NumMaskElts);
1214   for (unsigned i = 0; i != NumMaskElts; ++i)
1215     NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
1216 
1217   return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
1218 }
1219 
1220 /// Try to fold an extract+insert element into an existing identity shuffle by
1221 /// changing the shuffle's mask to include the index of this insert element.
1222 static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
1223   // Check if the vector operand of this insert is an identity shuffle.
1224   auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
1225   if (!Shuf || !match(Shuf->getOperand(1), m_Undef()) ||
1226       !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
1227     return nullptr;
1228 
1229   // Bail out early if shuffle is scalable type. The number of elements in
1230   // shuffle mask is unknown at compile-time.
1231   if (isa<ScalableVectorType>(Shuf->getType()))
1232     return nullptr;
1233 
1234   // Check for a constant insertion index.
1235   uint64_t IdxC;
1236   if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
1237     return nullptr;
1238 
1239   // Check if this insert's scalar op is extracted from the identity shuffle's
1240   // input vector.
1241   Value *Scalar = InsElt.getOperand(1);
1242   Value *X = Shuf->getOperand(0);
1243   if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
1244     return nullptr;
1245 
1246   // Replace the shuffle mask element at the index of this extract+insert with
1247   // that same index value.
1248   // For example:
1249   // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
1250   unsigned NumMaskElts =
1251       cast<FixedVectorType>(Shuf->getType())->getNumElements();
1252   SmallVector<int, 16> NewMask(NumMaskElts);
1253   ArrayRef<int> OldMask = Shuf->getShuffleMask();
1254   for (unsigned i = 0; i != NumMaskElts; ++i) {
1255     if (i != IdxC) {
1256       // All mask elements besides the inserted element remain the same.
1257       NewMask[i] = OldMask[i];
1258     } else if (OldMask[i] == (int)IdxC) {
1259       // If the mask element was already set, there's nothing to do
1260       // (demanded elements analysis may unset it later).
1261       return nullptr;
1262     } else {
1263       assert(OldMask[i] == UndefMaskElem &&
1264              "Unexpected shuffle mask element for identity shuffle");
1265       NewMask[i] = IdxC;
1266     }
1267   }
1268 
1269   return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
1270 }
1271 
1272 /// If we have an insertelement instruction feeding into another insertelement
1273 /// and the 2nd is inserting a constant into the vector, canonicalize that
1274 /// constant insertion before the insertion of a variable:
1275 ///
1276 /// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
1277 /// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
1278 ///
1279 /// This has the potential of eliminating the 2nd insertelement instruction
1280 /// via constant folding of the scalar constant into a vector constant.
1281 static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
1282                                      InstCombiner::BuilderTy &Builder) {
1283   auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
1284   if (!InsElt1 || !InsElt1->hasOneUse())
1285     return nullptr;
1286 
1287   Value *X, *Y;
1288   Constant *ScalarC;
1289   ConstantInt *IdxC1, *IdxC2;
1290   if (match(InsElt1->getOperand(0), m_Value(X)) &&
1291       match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
1292       match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
1293       match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
1294       match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
1295     Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
1296     return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
1297   }
1298 
1299   return nullptr;
1300 }
1301 
1302 /// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
1303 /// --> shufflevector X, CVec', Mask'
1304 static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
1305   auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
1306   // Bail out if the parent has more than one use. In that case, we'd be
1307   // replacing the insertelt with a shuffle, and that's not a clear win.
1308   if (!Inst || !Inst->hasOneUse())
1309     return nullptr;
1310   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
1311     // The shuffle must have a constant vector operand. The insertelt must have
1312     // a constant scalar being inserted at a constant position in the vector.
1313     Constant *ShufConstVec, *InsEltScalar;
1314     uint64_t InsEltIndex;
1315     if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
1316         !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
1317         !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
1318       return nullptr;
1319 
1320     // Adding an element to an arbitrary shuffle could be expensive, but a
1321     // shuffle that selects elements from vectors without crossing lanes is
1322     // assumed cheap.
1323     // If we're just adding a constant into that shuffle, it will still be
1324     // cheap.
1325     if (!isShuffleEquivalentToSelect(*Shuf))
1326       return nullptr;
1327 
1328     // From the above 'select' check, we know that the mask has the same number
1329     // of elements as the vector input operands. We also know that each constant
1330     // input element is used in its lane and can not be used more than once by
1331     // the shuffle. Therefore, replace the constant in the shuffle's constant
1332     // vector with the insertelt constant. Replace the constant in the shuffle's
1333     // mask vector with the insertelt index plus the length of the vector
1334     // (because the constant vector operand of a shuffle is always the 2nd
1335     // operand).
1336     ArrayRef<int> Mask = Shuf->getShuffleMask();
1337     unsigned NumElts = Mask.size();
1338     SmallVector<Constant *, 16> NewShufElts(NumElts);
1339     SmallVector<int, 16> NewMaskElts(NumElts);
1340     for (unsigned I = 0; I != NumElts; ++I) {
1341       if (I == InsEltIndex) {
1342         NewShufElts[I] = InsEltScalar;
1343         NewMaskElts[I] = InsEltIndex + NumElts;
1344       } else {
1345         // Copy over the existing values.
1346         NewShufElts[I] = ShufConstVec->getAggregateElement(I);
1347         NewMaskElts[I] = Mask[I];
1348       }
1349     }
1350 
1351     // Create new operands for a shuffle that includes the constant of the
1352     // original insertelt. The old shuffle will be dead now.
1353     return new ShuffleVectorInst(Shuf->getOperand(0),
1354                                  ConstantVector::get(NewShufElts), NewMaskElts);
1355   } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
1356     // Transform sequences of insertelements ops with constant data/indexes into
1357     // a single shuffle op.
1358     // Can not handle scalable type, the number of elements needed to create
1359     // shuffle mask is not a compile-time constant.
1360     if (isa<ScalableVectorType>(InsElt.getType()))
1361       return nullptr;
1362     unsigned NumElts =
1363         cast<FixedVectorType>(InsElt.getType())->getNumElements();
1364 
1365     uint64_t InsertIdx[2];
1366     Constant *Val[2];
1367     if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
1368         !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
1369         !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
1370         !match(IEI->getOperand(1), m_Constant(Val[1])))
1371       return nullptr;
1372     SmallVector<Constant *, 16> Values(NumElts);
1373     SmallVector<int, 16> Mask(NumElts);
1374     auto ValI = std::begin(Val);
1375     // Generate new constant vector and mask.
1376     // We have 2 values/masks from the insertelements instructions. Insert them
1377     // into new value/mask vectors.
1378     for (uint64_t I : InsertIdx) {
1379       if (!Values[I]) {
1380         Values[I] = *ValI;
1381         Mask[I] = NumElts + I;
1382       }
1383       ++ValI;
1384     }
1385     // Remaining values are filled with 'undef' values.
1386     for (unsigned I = 0; I < NumElts; ++I) {
1387       if (!Values[I]) {
1388         Values[I] = UndefValue::get(InsElt.getType()->getElementType());
1389         Mask[I] = I;
1390       }
1391     }
1392     // Create new operands for a shuffle that includes the constant of the
1393     // original insertelt.
1394     return new ShuffleVectorInst(IEI->getOperand(0),
1395                                  ConstantVector::get(Values), Mask);
1396   }
1397   return nullptr;
1398 }
1399 
1400 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
1401   Value *VecOp    = IE.getOperand(0);
1402   Value *ScalarOp = IE.getOperand(1);
1403   Value *IdxOp    = IE.getOperand(2);
1404 
1405   if (auto *V = SimplifyInsertElementInst(
1406           VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
1407     return replaceInstUsesWith(IE, V);
1408 
1409   // If the scalar is bitcast and inserted into undef, do the insert in the
1410   // source type followed by bitcast.
1411   // TODO: Generalize for insert into any constant, not just undef?
1412   Value *ScalarSrc;
1413   if (match(VecOp, m_Undef()) &&
1414       match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
1415       (ScalarSrc->getType()->isIntegerTy() ||
1416        ScalarSrc->getType()->isFloatingPointTy())) {
1417     // inselt undef, (bitcast ScalarSrc), IdxOp -->
1418     //   bitcast (inselt undef, ScalarSrc, IdxOp)
1419     Type *ScalarTy = ScalarSrc->getType();
1420     Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
1421     UndefValue *NewUndef = UndefValue::get(VecTy);
1422     Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
1423     return new BitCastInst(NewInsElt, IE.getType());
1424   }
1425 
1426   // If the vector and scalar are both bitcast from the same element type, do
1427   // the insert in that source type followed by bitcast.
1428   Value *VecSrc;
1429   if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
1430       match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
1431       (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
1432       VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
1433       cast<VectorType>(VecSrc->getType())->getElementType() ==
1434           ScalarSrc->getType()) {
1435     // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
1436     //   bitcast (inselt VecSrc, ScalarSrc, IdxOp)
1437     Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
1438     return new BitCastInst(NewInsElt, IE.getType());
1439   }
1440 
1441   // If the inserted element was extracted from some other fixed-length vector
1442   // and both indexes are valid constants, try to turn this into a shuffle.
1443   // Can not handle scalable vector type, the number of elements needed to
1444   // create shuffle mask is not a compile-time constant.
1445   uint64_t InsertedIdx, ExtractedIdx;
1446   Value *ExtVecOp;
1447   if (isa<FixedVectorType>(IE.getType()) &&
1448       match(IdxOp, m_ConstantInt(InsertedIdx)) &&
1449       match(ScalarOp,
1450             m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
1451       isa<FixedVectorType>(ExtVecOp->getType()) &&
1452       ExtractedIdx <
1453           cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
1454     // TODO: Looking at the user(s) to determine if this insert is a
1455     // fold-to-shuffle opportunity does not match the usual instcombine
1456     // constraints. We should decide if the transform is worthy based only
1457     // on this instruction and its operands, but that may not work currently.
1458     //
1459     // Here, we are trying to avoid creating shuffles before reaching
1460     // the end of a chain of extract-insert pairs. This is complicated because
1461     // we do not generally form arbitrary shuffle masks in instcombine
1462     // (because those may codegen poorly), but collectShuffleElements() does
1463     // exactly that.
1464     //
1465     // The rules for determining what is an acceptable target-independent
1466     // shuffle mask are fuzzy because they evolve based on the backend's
1467     // capabilities and real-world impact.
1468     auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
1469       if (!Insert.hasOneUse())
1470         return true;
1471       auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
1472       if (!InsertUser)
1473         return true;
1474       return false;
1475     };
1476 
1477     // Try to form a shuffle from a chain of extract-insert ops.
1478     if (isShuffleRootCandidate(IE)) {
1479       SmallVector<int, 16> Mask;
1480       ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
1481 
1482       // The proposed shuffle may be trivial, in which case we shouldn't
1483       // perform the combine.
1484       if (LR.first != &IE && LR.second != &IE) {
1485         // We now have a shuffle of LHS, RHS, Mask.
1486         if (LR.second == nullptr)
1487           LR.second = UndefValue::get(LR.first->getType());
1488         return new ShuffleVectorInst(LR.first, LR.second, Mask);
1489       }
1490     }
1491   }
1492 
1493   if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
1494     unsigned VWidth = VecTy->getNumElements();
1495     APInt UndefElts(VWidth, 0);
1496     APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
1497     if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
1498       if (V != &IE)
1499         return replaceInstUsesWith(IE, V);
1500       return &IE;
1501     }
1502   }
1503 
1504   if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
1505     return Shuf;
1506 
1507   if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
1508     return NewInsElt;
1509 
1510   if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
1511     return Broadcast;
1512 
1513   if (Instruction *Splat = foldInsEltIntoSplat(IE))
1514     return Splat;
1515 
1516   if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
1517     return IdentityShuf;
1518 
1519   return nullptr;
1520 }
1521 
1522 /// Return true if we can evaluate the specified expression tree if the vector
1523 /// elements were shuffled in a different order.
1524 static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
1525                                 unsigned Depth = 5) {
1526   // We can always reorder the elements of a constant.
1527   if (isa<Constant>(V))
1528     return true;
1529 
1530   // We won't reorder vector arguments. No IPO here.
1531   Instruction *I = dyn_cast<Instruction>(V);
1532   if (!I) return false;
1533 
1534   // Two users may expect different orders of the elements. Don't try it.
1535   if (!I->hasOneUse())
1536     return false;
1537 
1538   if (Depth == 0) return false;
1539 
1540   switch (I->getOpcode()) {
1541     case Instruction::UDiv:
1542     case Instruction::SDiv:
1543     case Instruction::URem:
1544     case Instruction::SRem:
1545       // Propagating an undefined shuffle mask element to integer div/rem is not
1546       // allowed because those opcodes can create immediate undefined behavior
1547       // from an undefined element in an operand.
1548       if (llvm::is_contained(Mask, -1))
1549         return false;
1550       LLVM_FALLTHROUGH;
1551     case Instruction::Add:
1552     case Instruction::FAdd:
1553     case Instruction::Sub:
1554     case Instruction::FSub:
1555     case Instruction::Mul:
1556     case Instruction::FMul:
1557     case Instruction::FDiv:
1558     case Instruction::FRem:
1559     case Instruction::Shl:
1560     case Instruction::LShr:
1561     case Instruction::AShr:
1562     case Instruction::And:
1563     case Instruction::Or:
1564     case Instruction::Xor:
1565     case Instruction::ICmp:
1566     case Instruction::FCmp:
1567     case Instruction::Trunc:
1568     case Instruction::ZExt:
1569     case Instruction::SExt:
1570     case Instruction::FPToUI:
1571     case Instruction::FPToSI:
1572     case Instruction::UIToFP:
1573     case Instruction::SIToFP:
1574     case Instruction::FPTrunc:
1575     case Instruction::FPExt:
1576     case Instruction::GetElementPtr: {
1577       // Bail out if we would create longer vector ops. We could allow creating
1578       // longer vector ops, but that may result in more expensive codegen.
1579       Type *ITy = I->getType();
1580       if (ITy->isVectorTy() &&
1581           Mask.size() > cast<FixedVectorType>(ITy)->getNumElements())
1582         return false;
1583       for (Value *Operand : I->operands()) {
1584         if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
1585           return false;
1586       }
1587       return true;
1588     }
1589     case Instruction::InsertElement: {
1590       ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
1591       if (!CI) return false;
1592       int ElementNumber = CI->getLimitedValue();
1593 
1594       // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
1595       // can't put an element into multiple indices.
1596       bool SeenOnce = false;
1597       for (int i = 0, e = Mask.size(); i != e; ++i) {
1598         if (Mask[i] == ElementNumber) {
1599           if (SeenOnce)
1600             return false;
1601           SeenOnce = true;
1602         }
1603       }
1604       return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
1605     }
1606   }
1607   return false;
1608 }
1609 
1610 /// Rebuild a new instruction just like 'I' but with the new operands given.
1611 /// In the event of type mismatch, the type of the operands is correct.
1612 static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
1613   // We don't want to use the IRBuilder here because we want the replacement
1614   // instructions to appear next to 'I', not the builder's insertion point.
1615   switch (I->getOpcode()) {
1616     case Instruction::Add:
1617     case Instruction::FAdd:
1618     case Instruction::Sub:
1619     case Instruction::FSub:
1620     case Instruction::Mul:
1621     case Instruction::FMul:
1622     case Instruction::UDiv:
1623     case Instruction::SDiv:
1624     case Instruction::FDiv:
1625     case Instruction::URem:
1626     case Instruction::SRem:
1627     case Instruction::FRem:
1628     case Instruction::Shl:
1629     case Instruction::LShr:
1630     case Instruction::AShr:
1631     case Instruction::And:
1632     case Instruction::Or:
1633     case Instruction::Xor: {
1634       BinaryOperator *BO = cast<BinaryOperator>(I);
1635       assert(NewOps.size() == 2 && "binary operator with #ops != 2");
1636       BinaryOperator *New =
1637           BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
1638                                  NewOps[0], NewOps[1], "", BO);
1639       if (isa<OverflowingBinaryOperator>(BO)) {
1640         New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
1641         New->setHasNoSignedWrap(BO->hasNoSignedWrap());
1642       }
1643       if (isa<PossiblyExactOperator>(BO)) {
1644         New->setIsExact(BO->isExact());
1645       }
1646       if (isa<FPMathOperator>(BO))
1647         New->copyFastMathFlags(I);
1648       return New;
1649     }
1650     case Instruction::ICmp:
1651       assert(NewOps.size() == 2 && "icmp with #ops != 2");
1652       return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
1653                           NewOps[0], NewOps[1]);
1654     case Instruction::FCmp:
1655       assert(NewOps.size() == 2 && "fcmp with #ops != 2");
1656       return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
1657                           NewOps[0], NewOps[1]);
1658     case Instruction::Trunc:
1659     case Instruction::ZExt:
1660     case Instruction::SExt:
1661     case Instruction::FPToUI:
1662     case Instruction::FPToSI:
1663     case Instruction::UIToFP:
1664     case Instruction::SIToFP:
1665     case Instruction::FPTrunc:
1666     case Instruction::FPExt: {
1667       // It's possible that the mask has a different number of elements from
1668       // the original cast. We recompute the destination type to match the mask.
1669       Type *DestTy = VectorType::get(
1670           I->getType()->getScalarType(),
1671           cast<VectorType>(NewOps[0]->getType())->getElementCount());
1672       assert(NewOps.size() == 1 && "cast with #ops != 1");
1673       return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
1674                               "", I);
1675     }
1676     case Instruction::GetElementPtr: {
1677       Value *Ptr = NewOps[0];
1678       ArrayRef<Value*> Idx = NewOps.slice(1);
1679       GetElementPtrInst *GEP = GetElementPtrInst::Create(
1680           cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
1681       GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
1682       return GEP;
1683     }
1684   }
1685   llvm_unreachable("failed to rebuild vector instructions");
1686 }
1687 
1688 static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
1689   // Mask.size() does not need to be equal to the number of vector elements.
1690 
1691   assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
1692   Type *EltTy = V->getType()->getScalarType();
1693   Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
1694   if (match(V, m_Undef()))
1695     return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
1696 
1697   if (isa<ConstantAggregateZero>(V))
1698     return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
1699 
1700   if (Constant *C = dyn_cast<Constant>(V))
1701     return ConstantExpr::getShuffleVector(C, PoisonValue::get(C->getType()),
1702                                           Mask);
1703 
1704   Instruction *I = cast<Instruction>(V);
1705   switch (I->getOpcode()) {
1706     case Instruction::Add:
1707     case Instruction::FAdd:
1708     case Instruction::Sub:
1709     case Instruction::FSub:
1710     case Instruction::Mul:
1711     case Instruction::FMul:
1712     case Instruction::UDiv:
1713     case Instruction::SDiv:
1714     case Instruction::FDiv:
1715     case Instruction::URem:
1716     case Instruction::SRem:
1717     case Instruction::FRem:
1718     case Instruction::Shl:
1719     case Instruction::LShr:
1720     case Instruction::AShr:
1721     case Instruction::And:
1722     case Instruction::Or:
1723     case Instruction::Xor:
1724     case Instruction::ICmp:
1725     case Instruction::FCmp:
1726     case Instruction::Trunc:
1727     case Instruction::ZExt:
1728     case Instruction::SExt:
1729     case Instruction::FPToUI:
1730     case Instruction::FPToSI:
1731     case Instruction::UIToFP:
1732     case Instruction::SIToFP:
1733     case Instruction::FPTrunc:
1734     case Instruction::FPExt:
1735     case Instruction::Select:
1736     case Instruction::GetElementPtr: {
1737       SmallVector<Value*, 8> NewOps;
1738       bool NeedsRebuild =
1739           (Mask.size() !=
1740            cast<FixedVectorType>(I->getType())->getNumElements());
1741       for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
1742         Value *V;
1743         // Recursively call evaluateInDifferentElementOrder on vector arguments
1744         // as well. E.g. GetElementPtr may have scalar operands even if the
1745         // return value is a vector, so we need to examine the operand type.
1746         if (I->getOperand(i)->getType()->isVectorTy())
1747           V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
1748         else
1749           V = I->getOperand(i);
1750         NewOps.push_back(V);
1751         NeedsRebuild |= (V != I->getOperand(i));
1752       }
1753       if (NeedsRebuild) {
1754         return buildNew(I, NewOps);
1755       }
1756       return I;
1757     }
1758     case Instruction::InsertElement: {
1759       int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
1760 
1761       // The insertelement was inserting at Element. Figure out which element
1762       // that becomes after shuffling. The answer is guaranteed to be unique
1763       // by CanEvaluateShuffled.
1764       bool Found = false;
1765       int Index = 0;
1766       for (int e = Mask.size(); Index != e; ++Index) {
1767         if (Mask[Index] == Element) {
1768           Found = true;
1769           break;
1770         }
1771       }
1772 
1773       // If element is not in Mask, no need to handle the operand 1 (element to
1774       // be inserted). Just evaluate values in operand 0 according to Mask.
1775       if (!Found)
1776         return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
1777 
1778       Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
1779       return InsertElementInst::Create(V, I->getOperand(1),
1780                                        ConstantInt::get(I32Ty, Index), "", I);
1781     }
1782   }
1783   llvm_unreachable("failed to reorder elements of vector instruction!");
1784 }
1785 
1786 // Returns true if the shuffle is extracting a contiguous range of values from
1787 // LHS, for example:
1788 //                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
1789 //   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
1790 //   Shuffles to:  |EE|FF|GG|HH|
1791 //                 +--+--+--+--+
1792 static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
1793                                        ArrayRef<int> Mask) {
1794   unsigned LHSElems =
1795       cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();
1796   unsigned MaskElems = Mask.size();
1797   unsigned BegIdx = Mask.front();
1798   unsigned EndIdx = Mask.back();
1799   if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
1800     return false;
1801   for (unsigned I = 0; I != MaskElems; ++I)
1802     if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
1803       return false;
1804   return true;
1805 }
1806 
1807 /// These are the ingredients in an alternate form binary operator as described
1808 /// below.
1809 struct BinopElts {
1810   BinaryOperator::BinaryOps Opcode;
1811   Value *Op0;
1812   Value *Op1;
1813   BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
1814             Value *V0 = nullptr, Value *V1 = nullptr) :
1815       Opcode(Opc), Op0(V0), Op1(V1) {}
1816   operator bool() const { return Opcode != 0; }
1817 };
1818 
1819 /// Binops may be transformed into binops with different opcodes and operands.
1820 /// Reverse the usual canonicalization to enable folds with the non-canonical
1821 /// form of the binop. If a transform is possible, return the elements of the
1822 /// new binop. If not, return invalid elements.
1823 static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
1824   Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
1825   Type *Ty = BO->getType();
1826   switch (BO->getOpcode()) {
1827     case Instruction::Shl: {
1828       // shl X, C --> mul X, (1 << C)
1829       Constant *C;
1830       if (match(BO1, m_Constant(C))) {
1831         Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
1832         return { Instruction::Mul, BO0, ShlOne };
1833       }
1834       break;
1835     }
1836     case Instruction::Or: {
1837       // or X, C --> add X, C (when X and C have no common bits set)
1838       const APInt *C;
1839       if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
1840         return { Instruction::Add, BO0, BO1 };
1841       break;
1842     }
1843     default:
1844       break;
1845   }
1846   return {};
1847 }
1848 
1849 static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
1850   assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
1851 
1852   // Are we shuffling together some value and that same value after it has been
1853   // modified by a binop with a constant?
1854   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
1855   Constant *C;
1856   bool Op0IsBinop;
1857   if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
1858     Op0IsBinop = true;
1859   else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
1860     Op0IsBinop = false;
1861   else
1862     return nullptr;
1863 
1864   // The identity constant for a binop leaves a variable operand unchanged. For
1865   // a vector, this is a splat of something like 0, -1, or 1.
1866   // If there's no identity constant for this binop, we're done.
1867   auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
1868   BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
1869   Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
1870   if (!IdC)
1871     return nullptr;
1872 
1873   // Shuffle identity constants into the lanes that return the original value.
1874   // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
1875   // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
1876   // The existing binop constant vector remains in the same operand position.
1877   ArrayRef<int> Mask = Shuf.getShuffleMask();
1878   Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
1879                                 ConstantExpr::getShuffleVector(IdC, C, Mask);
1880 
1881   bool MightCreatePoisonOrUB =
1882       is_contained(Mask, UndefMaskElem) &&
1883       (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
1884   if (MightCreatePoisonOrUB)
1885     NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);
1886 
1887   // shuf (bop X, C), X, M --> bop X, C'
1888   // shuf X, (bop X, C), M --> bop X, C'
1889   Value *X = Op0IsBinop ? Op1 : Op0;
1890   Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
1891   NewBO->copyIRFlags(BO);
1892 
1893   // An undef shuffle mask element may propagate as an undef constant element in
1894   // the new binop. That would produce poison where the original code might not.
1895   // If we already made a safe constant, then there's no danger.
1896   if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
1897     NewBO->dropPoisonGeneratingFlags();
1898   return NewBO;
1899 }
1900 
1901 /// If we have an insert of a scalar to a non-zero element of an undefined
1902 /// vector and then shuffle that value, that's the same as inserting to the zero
1903 /// element and shuffling. Splatting from the zero element is recognized as the
1904 /// canonical form of splat.
1905 static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
1906                                             InstCombiner::BuilderTy &Builder) {
1907   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
1908   ArrayRef<int> Mask = Shuf.getShuffleMask();
1909   Value *X;
1910   uint64_t IndexC;
1911 
1912   // Match a shuffle that is a splat to a non-zero element.
1913   if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
1914                                        m_ConstantInt(IndexC)))) ||
1915       !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
1916     return nullptr;
1917 
1918   // Insert into element 0 of an undef vector.
1919   UndefValue *UndefVec = UndefValue::get(Shuf.getType());
1920   Constant *Zero = Builder.getInt32(0);
1921   Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
1922 
1923   // Splat from element 0. Any mask element that is undefined remains undefined.
1924   // For example:
1925   // shuf (inselt undef, X, 2), undef, <2,2,undef>
1926   //   --> shuf (inselt undef, X, 0), undef, <0,0,undef>
1927   unsigned NumMaskElts =
1928       cast<FixedVectorType>(Shuf.getType())->getNumElements();
1929   SmallVector<int, 16> NewMask(NumMaskElts, 0);
1930   for (unsigned i = 0; i != NumMaskElts; ++i)
1931     if (Mask[i] == UndefMaskElem)
1932       NewMask[i] = Mask[i];
1933 
1934   return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
1935 }
1936 
1937 /// Try to fold shuffles that are the equivalent of a vector select.
1938 static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
1939                                       InstCombiner::BuilderTy &Builder,
1940                                       const DataLayout &DL) {
1941   if (!Shuf.isSelect())
1942     return nullptr;
1943 
1944   // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
1945   // Commuting undef to operand 0 conflicts with another canonicalization.
1946   unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
1947   if (!match(Shuf.getOperand(1), m_Undef()) &&
1948       Shuf.getMaskValue(0) >= (int)NumElts) {
1949     // TODO: Can we assert that both operands of a shuffle-select are not undef
1950     // (otherwise, it would have been folded by instsimplify?
1951     Shuf.commute();
1952     return &Shuf;
1953   }
1954 
1955   if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
1956     return I;
1957 
1958   BinaryOperator *B0, *B1;
1959   if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
1960       !match(Shuf.getOperand(1), m_BinOp(B1)))
1961     return nullptr;
1962 
1963   Value *X, *Y;
1964   Constant *C0, *C1;
1965   bool ConstantsAreOp1;
1966   if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
1967       match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
1968     ConstantsAreOp1 = true;
1969   else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
1970            match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
1971     ConstantsAreOp1 = false;
1972   else
1973     return nullptr;
1974 
1975   // We need matching binops to fold the lanes together.
1976   BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
1977   BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
1978   bool DropNSW = false;
1979   if (ConstantsAreOp1 && Opc0 != Opc1) {
1980     // TODO: We drop "nsw" if shift is converted into multiply because it may
1981     // not be correct when the shift amount is BitWidth - 1. We could examine
1982     // each vector element to determine if it is safe to keep that flag.
1983     if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
1984       DropNSW = true;
1985     if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
1986       assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
1987       Opc0 = AltB0.Opcode;
1988       C0 = cast<Constant>(AltB0.Op1);
1989     } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
1990       assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
1991       Opc1 = AltB1.Opcode;
1992       C1 = cast<Constant>(AltB1.Op1);
1993     }
1994   }
1995 
1996   if (Opc0 != Opc1)
1997     return nullptr;
1998 
1999   // The opcodes must be the same. Use a new name to make that clear.
2000   BinaryOperator::BinaryOps BOpc = Opc0;
2001 
2002   // Select the constant elements needed for the single binop.
2003   ArrayRef<int> Mask = Shuf.getShuffleMask();
2004   Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
2005 
2006   // We are moving a binop after a shuffle. When a shuffle has an undefined
2007   // mask element, the result is undefined, but it is not poison or undefined
2008   // behavior. That is not necessarily true for div/rem/shift.
2009   bool MightCreatePoisonOrUB =
2010       is_contained(Mask, UndefMaskElem) &&
2011       (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
2012   if (MightCreatePoisonOrUB)
2013     NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,
2014                                                        ConstantsAreOp1);
2015 
2016   Value *V;
2017   if (X == Y) {
2018     // Remove a binop and the shuffle by rearranging the constant:
2019     // shuffle (op V, C0), (op V, C1), M --> op V, C'
2020     // shuffle (op C0, V), (op C1, V), M --> op C', V
2021     V = X;
2022   } else {
2023     // If there are 2 different variable operands, we must create a new shuffle
2024     // (select) first, so check uses to ensure that we don't end up with more
2025     // instructions than we started with.
2026     if (!B0->hasOneUse() && !B1->hasOneUse())
2027       return nullptr;
2028 
2029     // If we use the original shuffle mask and op1 is *variable*, we would be
2030     // putting an undef into operand 1 of div/rem/shift. This is either UB or
2031     // poison. We do not have to guard against UB when *constants* are op1
2032     // because safe constants guarantee that we do not overflow sdiv/srem (and
2033     // there's no danger for other opcodes).
2034     // TODO: To allow this case, create a new shuffle mask with no undefs.
2035     if (MightCreatePoisonOrUB && !ConstantsAreOp1)
2036       return nullptr;
2037 
2038     // Note: In general, we do not create new shuffles in InstCombine because we
2039     // do not know if a target can lower an arbitrary shuffle optimally. In this
2040     // case, the shuffle uses the existing mask, so there is no additional risk.
2041 
2042     // Select the variable vectors first, then perform the binop:
2043     // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
2044     // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
2045     V = Builder.CreateShuffleVector(X, Y, Mask);
2046   }
2047 
2048   Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
2049                                          BinaryOperator::Create(BOpc, NewC, V);
2050 
2051   // Flags are intersected from the 2 source binops. But there are 2 exceptions:
2052   // 1. If we changed an opcode, poison conditions might have changed.
2053   // 2. If the shuffle had undef mask elements, the new binop might have undefs
2054   //    where the original code did not. But if we already made a safe constant,
2055   //    then there's no danger.
2056   NewBO->copyIRFlags(B0);
2057   NewBO->andIRFlags(B1);
2058   if (DropNSW)
2059     NewBO->setHasNoSignedWrap(false);
2060   if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
2061     NewBO->dropPoisonGeneratingFlags();
2062   return NewBO;
2063 }
2064 
2065 /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
2066 /// Example (little endian):
2067 /// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
2068 static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
2069                                      bool IsBigEndian) {
2070   // This must be a bitcasted shuffle of 1 vector integer operand.
2071   Type *DestType = Shuf.getType();
2072   Value *X;
2073   if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
2074       !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
2075     return nullptr;
2076 
2077   // The source type must have the same number of elements as the shuffle,
2078   // and the source element type must be larger than the shuffle element type.
2079   Type *SrcType = X->getType();
2080   if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
2081       cast<FixedVectorType>(SrcType)->getNumElements() !=
2082           cast<FixedVectorType>(DestType)->getNumElements() ||
2083       SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
2084     return nullptr;
2085 
2086   assert(Shuf.changesLength() && !Shuf.increasesLength() &&
2087          "Expected a shuffle that decreases length");
2088 
2089   // Last, check that the mask chooses the correct low bits for each narrow
2090   // element in the result.
2091   uint64_t TruncRatio =
2092       SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
2093   ArrayRef<int> Mask = Shuf.getShuffleMask();
2094   for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
2095     if (Mask[i] == UndefMaskElem)
2096       continue;
2097     uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
2098     assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits");
2099     if (Mask[i] != (int)LSBIndex)
2100       return nullptr;
2101   }
2102 
2103   return new TruncInst(X, DestType);
2104 }
2105 
2106 /// Match a shuffle-select-shuffle pattern where the shuffles are widening and
2107 /// narrowing (concatenating with undef and extracting back to the original
2108 /// length). This allows replacing the wide select with a narrow select.
2109 static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
2110                                        InstCombiner::BuilderTy &Builder) {
2111   // This must be a narrowing identity shuffle. It extracts the 1st N elements
2112   // of the 1st vector operand of a shuffle.
2113   if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
2114     return nullptr;
2115 
2116   // The vector being shuffled must be a vector select that we can eliminate.
2117   // TODO: The one-use requirement could be eased if X and/or Y are constants.
2118   Value *Cond, *X, *Y;
2119   if (!match(Shuf.getOperand(0),
2120              m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
2121     return nullptr;
2122 
2123   // We need a narrow condition value. It must be extended with undef elements
2124   // and have the same number of elements as this shuffle.
2125   unsigned NarrowNumElts =
2126       cast<FixedVectorType>(Shuf.getType())->getNumElements();
2127   Value *NarrowCond;
2128   if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
2129       cast<FixedVectorType>(NarrowCond->getType())->getNumElements() !=
2130           NarrowNumElts ||
2131       !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
2132     return nullptr;
2133 
2134   // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask) -->
2135   // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
2136   Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());
2137   Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask());
2138   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
2139 }
2140 
2141 /// Try to fold an extract subvector operation.
2142 static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
2143   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
2144   if (!Shuf.isIdentityWithExtract() || !match(Op1, m_Undef()))
2145     return nullptr;
2146 
2147   // Check if we are extracting all bits of an inserted scalar:
2148   // extract-subvec (bitcast (inselt ?, X, 0) --> bitcast X to subvec type
2149   Value *X;
2150   if (match(Op0, m_BitCast(m_InsertElt(m_Value(), m_Value(X), m_Zero()))) &&
2151       X->getType()->getPrimitiveSizeInBits() ==
2152           Shuf.getType()->getPrimitiveSizeInBits())
2153     return new BitCastInst(X, Shuf.getType());
2154 
2155   // Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
2156   Value *Y;
2157   ArrayRef<int> Mask;
2158   if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
2159     return nullptr;
2160 
2161   // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
2162   // then combining may result in worse codegen.
2163   if (!Op0->hasOneUse())
2164     return nullptr;
2165 
2166   // We are extracting a subvector from a shuffle. Remove excess elements from
2167   // the 1st shuffle mask to eliminate the extract.
2168   //
2169   // This transform is conservatively limited to identity extracts because we do
2170   // not allow arbitrary shuffle mask creation as a target-independent transform
2171   // (because we can't guarantee that will lower efficiently).
2172   //
2173   // If the extracting shuffle has an undef mask element, it transfers to the
2174   // new shuffle mask. Otherwise, copy the original mask element. Example:
2175   //   shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
2176   //   shuf X, Y, <C0, undef, C2, undef>
2177   unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
2178   SmallVector<int, 16> NewMask(NumElts);
2179   assert(NumElts < Mask.size() &&
2180          "Identity with extract must have less elements than its inputs");
2181 
2182   for (unsigned i = 0; i != NumElts; ++i) {
2183     int ExtractMaskElt = Shuf.getMaskValue(i);
2184     int MaskElt = Mask[i];
2185     NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
2186   }
2187   return new ShuffleVectorInst(X, Y, NewMask);
2188 }
2189 
2190 /// Try to replace a shuffle with an insertelement or try to replace a shuffle
2191 /// operand with the operand of an insertelement.
2192 static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
2193                                           InstCombinerImpl &IC) {
2194   Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
2195   SmallVector<int, 16> Mask;
2196   Shuf.getShuffleMask(Mask);
2197 
2198   // The shuffle must not change vector sizes.
2199   // TODO: This restriction could be removed if the insert has only one use
2200   //       (because the transform would require a new length-changing shuffle).
2201   int NumElts = Mask.size();
2202   if (NumElts != (int)(cast<FixedVectorType>(V0->getType())->getNumElements()))
2203     return nullptr;
2204 
2205   // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
2206   // not be able to handle it there if the insertelement has >1 use.
2207   // If the shuffle has an insertelement operand but does not choose the
2208   // inserted scalar element from that value, then we can replace that shuffle
2209   // operand with the source vector of the insertelement.
2210   Value *X;
2211   uint64_t IdxC;
2212   if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
2213     // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
2214     if (!is_contained(Mask, (int)IdxC))
2215       return IC.replaceOperand(Shuf, 0, X);
2216   }
2217   if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
2218     // Offset the index constant by the vector width because we are checking for
2219     // accesses to the 2nd vector input of the shuffle.
2220     IdxC += NumElts;
2221     // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
2222     if (!is_contained(Mask, (int)IdxC))
2223       return IC.replaceOperand(Shuf, 1, X);
2224   }
2225 
2226   // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
2227   auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
2228     // We need an insertelement with a constant index.
2229     if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
2230                                m_ConstantInt(IndexC))))
2231       return false;
2232 
2233     // Test the shuffle mask to see if it splices the inserted scalar into the
2234     // operand 1 vector of the shuffle.
2235     int NewInsIndex = -1;
2236     for (int i = 0; i != NumElts; ++i) {
2237       // Ignore undef mask elements.
2238       if (Mask[i] == -1)
2239         continue;
2240 
2241       // The shuffle takes elements of operand 1 without lane changes.
2242       if (Mask[i] == NumElts + i)
2243         continue;
2244 
2245       // The shuffle must choose the inserted scalar exactly once.
2246       if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
2247         return false;
2248 
2249       // The shuffle is placing the inserted scalar into element i.
2250       NewInsIndex = i;
2251     }
2252 
2253     assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
2254 
2255     // Index is updated to the potentially translated insertion lane.
2256     IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
2257     return true;
2258   };
2259 
2260   // If the shuffle is unnecessary, insert the scalar operand directly into
2261   // operand 1 of the shuffle. Example:
2262   // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
2263   Value *Scalar;
2264   ConstantInt *IndexC;
2265   if (isShufflingScalarIntoOp1(Scalar, IndexC))
2266     return InsertElementInst::Create(V1, Scalar, IndexC);
2267 
2268   // Try again after commuting shuffle. Example:
2269   // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
2270   // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
2271   std::swap(V0, V1);
2272   ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
2273   if (isShufflingScalarIntoOp1(Scalar, IndexC))
2274     return InsertElementInst::Create(V1, Scalar, IndexC);
2275 
2276   return nullptr;
2277 }
2278 
2279 static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
2280   // Match the operands as identity with padding (also known as concatenation
2281   // with undef) shuffles of the same source type. The backend is expected to
2282   // recreate these concatenations from a shuffle of narrow operands.
2283   auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
2284   auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
2285   if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
2286       !Shuffle1 || !Shuffle1->isIdentityWithPadding())
2287     return nullptr;
2288 
2289   // We limit this transform to power-of-2 types because we expect that the
2290   // backend can convert the simplified IR patterns to identical nodes as the
2291   // original IR.
2292   // TODO: If we can verify the same behavior for arbitrary types, the
2293   //       power-of-2 checks can be removed.
2294   Value *X = Shuffle0->getOperand(0);
2295   Value *Y = Shuffle1->getOperand(0);
2296   if (X->getType() != Y->getType() ||
2297       !isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) ||
2298       !isPowerOf2_32(
2299           cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) ||
2300       !isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) ||
2301       match(X, m_Undef()) || match(Y, m_Undef()))
2302     return nullptr;
2303   assert(match(Shuffle0->getOperand(1), m_Undef()) &&
2304          match(Shuffle1->getOperand(1), m_Undef()) &&
2305          "Unexpected operand for identity shuffle");
2306 
2307   // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
2308   // operands directly by adjusting the shuffle mask to account for the narrower
2309   // types:
2310   // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
2311   int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements();
2312   int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements();
2313   assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
2314 
2315   ArrayRef<int> Mask = Shuf.getShuffleMask();
2316   SmallVector<int, 16> NewMask(Mask.size(), -1);
2317   for (int i = 0, e = Mask.size(); i != e; ++i) {
2318     if (Mask[i] == -1)
2319       continue;
2320 
2321     // If this shuffle is choosing an undef element from 1 of the sources, that
2322     // element is undef.
2323     if (Mask[i] < WideElts) {
2324       if (Shuffle0->getMaskValue(Mask[i]) == -1)
2325         continue;
2326     } else {
2327       if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
2328         continue;
2329     }
2330 
2331     // If this shuffle is choosing from the 1st narrow op, the mask element is
2332     // the same. If this shuffle is choosing from the 2nd narrow op, the mask
2333     // element is offset down to adjust for the narrow vector widths.
2334     if (Mask[i] < WideElts) {
2335       assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
2336       NewMask[i] = Mask[i];
2337     } else {
2338       assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
2339       NewMask[i] = Mask[i] - (WideElts - NarrowElts);
2340     }
2341   }
2342   return new ShuffleVectorInst(X, Y, NewMask);
2343 }
2344 
2345 Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
2346   Value *LHS = SVI.getOperand(0);
2347   Value *RHS = SVI.getOperand(1);
2348   SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
2349   if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
2350                                           SVI.getType(), ShufQuery))
2351     return replaceInstUsesWith(SVI, V);
2352 
2353   // Bail out for scalable vectors
2354   if (isa<ScalableVectorType>(LHS->getType()))
2355     return nullptr;
2356 
2357   unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();
2358   unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();
2359 
2360   // shuffle (bitcast X), (bitcast Y), Mask --> bitcast (shuffle X, Y, Mask)
2361   //
2362   // if X and Y are of the same (vector) type, and the element size is not
2363   // changed by the bitcasts, we can distribute the bitcasts through the
2364   // shuffle, hopefully reducing the number of instructions. We make sure that
2365   // at least one bitcast only has one use, so we don't *increase* the number of
2366   // instructions here.
2367   Value *X, *Y;
2368   if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_BitCast(m_Value(Y))) &&
2369       X->getType()->isVectorTy() && X->getType() == Y->getType() &&
2370       X->getType()->getScalarSizeInBits() ==
2371           SVI.getType()->getScalarSizeInBits() &&
2372       (LHS->hasOneUse() || RHS->hasOneUse())) {
2373     Value *V = Builder.CreateShuffleVector(X, Y, SVI.getShuffleMask(),
2374                                            SVI.getName() + ".uncasted");
2375     return new BitCastInst(V, SVI.getType());
2376   }
2377 
2378   ArrayRef<int> Mask = SVI.getShuffleMask();
2379   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
2380 
2381   // Peek through a bitcasted shuffle operand by scaling the mask. If the
2382   // simulated shuffle can simplify, then this shuffle is unnecessary:
2383   // shuf (bitcast X), undef, Mask --> bitcast X'
2384   // TODO: This could be extended to allow length-changing shuffles.
2385   //       The transform might also be obsoleted if we allowed canonicalization
2386   //       of bitcasted shuffles.
2387   if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
2388       X->getType()->isVectorTy() && VWidth == LHSWidth) {
2389     // Try to create a scaled mask constant.
2390     auto *XType = cast<FixedVectorType>(X->getType());
2391     unsigned XNumElts = XType->getNumElements();
2392     SmallVector<int, 16> ScaledMask;
2393     if (XNumElts >= VWidth) {
2394       assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
2395       narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
2396     } else {
2397       assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
2398       if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
2399         ScaledMask.clear();
2400     }
2401     if (!ScaledMask.empty()) {
2402       // If the shuffled source vector simplifies, cast that value to this
2403       // shuffle's type.
2404       if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
2405                                               ScaledMask, XType, ShufQuery))
2406         return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
2407     }
2408   }
2409 
2410   // shuffle x, x, mask --> shuffle x, undef, mask'
2411   if (LHS == RHS) {
2412     assert(!match(RHS, m_Undef()) &&
2413            "Shuffle with 2 undef ops not simplified?");
2414     // Remap any references to RHS to use LHS.
2415     SmallVector<int, 16> Elts;
2416     for (unsigned i = 0; i != VWidth; ++i) {
2417       // Propagate undef elements or force mask to LHS.
2418       if (Mask[i] < 0)
2419         Elts.push_back(UndefMaskElem);
2420       else
2421         Elts.push_back(Mask[i] % LHSWidth);
2422     }
2423     return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
2424   }
2425 
2426   // shuffle undef, x, mask --> shuffle x, undef, mask'
2427   if (match(LHS, m_Undef())) {
2428     SVI.commute();
2429     return &SVI;
2430   }
2431 
2432   if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
2433     return I;
2434 
2435   if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
2436     return I;
2437 
2438   if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
2439     return I;
2440 
2441   if (Instruction *I = narrowVectorSelect(SVI, Builder))
2442     return I;
2443 
2444   APInt UndefElts(VWidth, 0);
2445   APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
2446   if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
2447     if (V != &SVI)
2448       return replaceInstUsesWith(SVI, V);
2449     return &SVI;
2450   }
2451 
2452   if (Instruction *I = foldIdentityExtractShuffle(SVI))
2453     return I;
2454 
2455   // These transforms have the potential to lose undef knowledge, so they are
2456   // intentionally placed after SimplifyDemandedVectorElts().
2457   if (Instruction *I = foldShuffleWithInsert(SVI, *this))
2458     return I;
2459   if (Instruction *I = foldIdentityPaddedShuffles(SVI))
2460     return I;
2461 
2462   if (match(RHS, m_Undef()) && canEvaluateShuffled(LHS, Mask)) {
2463     Value *V = evaluateInDifferentElementOrder(LHS, Mask);
2464     return replaceInstUsesWith(SVI, V);
2465   }
2466 
2467   // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
2468   // a non-vector type. We can instead bitcast the original vector followed by
2469   // an extract of the desired element:
2470   //
2471   //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
2472   //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2473   //   %1 = bitcast <4 x i8> %sroa to i32
2474   // Becomes:
2475   //   %bc = bitcast <16 x i8> %in to <4 x i32>
2476   //   %ext = extractelement <4 x i32> %bc, i32 0
2477   //
2478   // If the shuffle is extracting a contiguous range of values from the input
2479   // vector then each use which is a bitcast of the extracted size can be
2480   // replaced. This will work if the vector types are compatible, and the begin
2481   // index is aligned to a value in the casted vector type. If the begin index
2482   // isn't aligned then we can shuffle the original vector (keeping the same
2483   // vector type) before extracting.
2484   //
2485   // This code will bail out if the target type is fundamentally incompatible
2486   // with vectors of the source type.
2487   //
2488   // Example of <16 x i8>, target type i32:
2489   // Index range [4,8):         v-----------v Will work.
2490   //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
2491   //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
2492   //     <4 x i32>: |           |           |           |           |
2493   //                +-----------+-----------+-----------+-----------+
2494   // Index range [6,10):              ^-----------^ Needs an extra shuffle.
2495   // Target type i40:           ^--------------^ Won't work, bail.
2496   bool MadeChange = false;
2497   if (isShuffleExtractingFromLHS(SVI, Mask)) {
2498     Value *V = LHS;
2499     unsigned MaskElems = Mask.size();
2500     auto *SrcTy = cast<FixedVectorType>(V->getType());
2501     unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
2502     unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
2503     assert(SrcElemBitWidth && "vector elements must have a bitwidth");
2504     unsigned SrcNumElems = SrcTy->getNumElements();
2505     SmallVector<BitCastInst *, 8> BCs;
2506     DenseMap<Type *, Value *> NewBCs;
2507     for (User *U : SVI.users())
2508       if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
2509         if (!BC->use_empty())
2510           // Only visit bitcasts that weren't previously handled.
2511           BCs.push_back(BC);
2512     for (BitCastInst *BC : BCs) {
2513       unsigned BegIdx = Mask.front();
2514       Type *TgtTy = BC->getDestTy();
2515       unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
2516       if (!TgtElemBitWidth)
2517         continue;
2518       unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
2519       bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
2520       bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
2521       if (!VecBitWidthsEqual)
2522         continue;
2523       if (!VectorType::isValidElementType(TgtTy))
2524         continue;
2525       auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
2526       if (!BegIsAligned) {
2527         // Shuffle the input so [0,NumElements) contains the output, and
2528         // [NumElems,SrcNumElems) is undef.
2529         SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
2530         for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
2531           ShuffleMask[I] = Idx;
2532         V = Builder.CreateShuffleVector(V, ShuffleMask,
2533                                         SVI.getName() + ".extract");
2534         BegIdx = 0;
2535       }
2536       unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
2537       assert(SrcElemsPerTgtElem);
2538       BegIdx /= SrcElemsPerTgtElem;
2539       bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
2540       auto *NewBC =
2541           BCAlreadyExists
2542               ? NewBCs[CastSrcTy]
2543               : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
2544       if (!BCAlreadyExists)
2545         NewBCs[CastSrcTy] = NewBC;
2546       auto *Ext = Builder.CreateExtractElement(
2547           NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
2548       // The shufflevector isn't being replaced: the bitcast that used it
2549       // is. InstCombine will visit the newly-created instructions.
2550       replaceInstUsesWith(*BC, Ext);
2551       MadeChange = true;
2552     }
2553   }
2554 
2555   // If the LHS is a shufflevector itself, see if we can combine it with this
2556   // one without producing an unusual shuffle.
2557   // Cases that might be simplified:
2558   // 1.
2559   // x1=shuffle(v1,v2,mask1)
2560   //  x=shuffle(x1,undef,mask)
2561   //        ==>
2562   //  x=shuffle(v1,undef,newMask)
2563   // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
2564   // 2.
2565   // x1=shuffle(v1,undef,mask1)
2566   //  x=shuffle(x1,x2,mask)
2567   // where v1.size() == mask1.size()
2568   //        ==>
2569   //  x=shuffle(v1,x2,newMask)
2570   // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
2571   // 3.
2572   // x2=shuffle(v2,undef,mask2)
2573   //  x=shuffle(x1,x2,mask)
2574   // where v2.size() == mask2.size()
2575   //        ==>
2576   //  x=shuffle(x1,v2,newMask)
2577   // newMask[i] = (mask[i] < x1.size())
2578   //              ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
2579   // 4.
2580   // x1=shuffle(v1,undef,mask1)
2581   // x2=shuffle(v2,undef,mask2)
2582   //  x=shuffle(x1,x2,mask)
2583   // where v1.size() == v2.size()
2584   //        ==>
2585   //  x=shuffle(v1,v2,newMask)
2586   // newMask[i] = (mask[i] < x1.size())
2587   //              ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
2588   //
2589   // Here we are really conservative:
2590   // we are absolutely afraid of producing a shuffle mask not in the input
2591   // program, because the code gen may not be smart enough to turn a merged
2592   // shuffle into two specific shuffles: it may produce worse code.  As such,
2593   // we only merge two shuffles if the result is either a splat or one of the
2594   // input shuffle masks.  In this case, merging the shuffles just removes
2595   // one instruction, which we know is safe.  This is good for things like
2596   // turning: (splat(splat)) -> splat, or
2597   // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
2598   ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
2599   ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
2600   if (LHSShuffle)
2601     if (!match(LHSShuffle->getOperand(1), m_Undef()) && !match(RHS, m_Undef()))
2602       LHSShuffle = nullptr;
2603   if (RHSShuffle)
2604     if (!match(RHSShuffle->getOperand(1), m_Undef()))
2605       RHSShuffle = nullptr;
2606   if (!LHSShuffle && !RHSShuffle)
2607     return MadeChange ? &SVI : nullptr;
2608 
2609   Value* LHSOp0 = nullptr;
2610   Value* LHSOp1 = nullptr;
2611   Value* RHSOp0 = nullptr;
2612   unsigned LHSOp0Width = 0;
2613   unsigned RHSOp0Width = 0;
2614   if (LHSShuffle) {
2615     LHSOp0 = LHSShuffle->getOperand(0);
2616     LHSOp1 = LHSShuffle->getOperand(1);
2617     LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements();
2618   }
2619   if (RHSShuffle) {
2620     RHSOp0 = RHSShuffle->getOperand(0);
2621     RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements();
2622   }
2623   Value* newLHS = LHS;
2624   Value* newRHS = RHS;
2625   if (LHSShuffle) {
2626     // case 1
2627     if (match(RHS, m_Undef())) {
2628       newLHS = LHSOp0;
2629       newRHS = LHSOp1;
2630     }
2631     // case 2 or 4
2632     else if (LHSOp0Width == LHSWidth) {
2633       newLHS = LHSOp0;
2634     }
2635   }
2636   // case 3 or 4
2637   if (RHSShuffle && RHSOp0Width == LHSWidth) {
2638     newRHS = RHSOp0;
2639   }
2640   // case 4
2641   if (LHSOp0 == RHSOp0) {
2642     newLHS = LHSOp0;
2643     newRHS = nullptr;
2644   }
2645 
2646   if (newLHS == LHS && newRHS == RHS)
2647     return MadeChange ? &SVI : nullptr;
2648 
2649   ArrayRef<int> LHSMask;
2650   ArrayRef<int> RHSMask;
2651   if (newLHS != LHS)
2652     LHSMask = LHSShuffle->getShuffleMask();
2653   if (RHSShuffle && newRHS != RHS)
2654     RHSMask = RHSShuffle->getShuffleMask();
2655 
2656   unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
2657   SmallVector<int, 16> newMask;
2658   bool isSplat = true;
2659   int SplatElt = -1;
2660   // Create a new mask for the new ShuffleVectorInst so that the new
2661   // ShuffleVectorInst is equivalent to the original one.
2662   for (unsigned i = 0; i < VWidth; ++i) {
2663     int eltMask;
2664     if (Mask[i] < 0) {
2665       // This element is an undef value.
2666       eltMask = -1;
2667     } else if (Mask[i] < (int)LHSWidth) {
2668       // This element is from left hand side vector operand.
2669       //
2670       // If LHS is going to be replaced (case 1, 2, or 4), calculate the
2671       // new mask value for the element.
2672       if (newLHS != LHS) {
2673         eltMask = LHSMask[Mask[i]];
2674         // If the value selected is an undef value, explicitly specify it
2675         // with a -1 mask value.
2676         if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
2677           eltMask = -1;
2678       } else
2679         eltMask = Mask[i];
2680     } else {
2681       // This element is from right hand side vector operand
2682       //
2683       // If the value selected is an undef value, explicitly specify it
2684       // with a -1 mask value. (case 1)
2685       if (match(RHS, m_Undef()))
2686         eltMask = -1;
2687       // If RHS is going to be replaced (case 3 or 4), calculate the
2688       // new mask value for the element.
2689       else if (newRHS != RHS) {
2690         eltMask = RHSMask[Mask[i]-LHSWidth];
2691         // If the value selected is an undef value, explicitly specify it
2692         // with a -1 mask value.
2693         if (eltMask >= (int)RHSOp0Width) {
2694           assert(match(RHSShuffle->getOperand(1), m_Undef()) &&
2695                  "should have been check above");
2696           eltMask = -1;
2697         }
2698       } else
2699         eltMask = Mask[i]-LHSWidth;
2700 
2701       // If LHS's width is changed, shift the mask value accordingly.
2702       // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
2703       // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
2704       // If newRHS == newLHS, we want to remap any references from newRHS to
2705       // newLHS so that we can properly identify splats that may occur due to
2706       // obfuscation across the two vectors.
2707       if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
2708         eltMask += newLHSWidth;
2709     }
2710 
2711     // Check if this could still be a splat.
2712     if (eltMask >= 0) {
2713       if (SplatElt >= 0 && SplatElt != eltMask)
2714         isSplat = false;
2715       SplatElt = eltMask;
2716     }
2717 
2718     newMask.push_back(eltMask);
2719   }
2720 
2721   // If the result mask is equal to one of the original shuffle masks,
2722   // or is a splat, do the replacement.
2723   if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
2724     if (!newRHS)
2725       newRHS = UndefValue::get(newLHS->getType());
2726     return new ShuffleVectorInst(newLHS, newRHS, newMask);
2727   }
2728 
2729   return MadeChange ? &SVI : nullptr;
2730 }
2731