1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/CodeMetrics.h"
24 #include "llvm/Analysis/GlobalsModRef.h"
25 #include "llvm/Analysis/LoopAccessAnalysis.h"
26 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
27 #include "llvm/Analysis/ValueTracking.h"
28 #include "llvm/Analysis/VectorUtils.h"
29 #include "llvm/IR/DataLayout.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IRBuilder.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/IR/NoFolder.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/IR/Value.h"
38 #include "llvm/IR/Verifier.h"
39 #include "llvm/Pass.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/GraphWriter.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Transforms/Utils/LoopUtils.h"
45 #include "llvm/Transforms/Vectorize.h"
46 #include <algorithm>
47 #include <memory>
48 
49 using namespace llvm;
50 using namespace slpvectorizer;
51 
52 #define SV_NAME "slp-vectorizer"
53 #define DEBUG_TYPE "SLP"
54 
55 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
56 
57 static cl::opt<int>
58     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
59                      cl::desc("Only vectorize if you gain more than this "
60                               "number "));
61 
62 static cl::opt<bool>
63 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
64                    cl::desc("Attempt to vectorize horizontal reductions"));
65 
66 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
67     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
68     cl::desc(
69         "Attempt to vectorize horizontal reductions feeding into a store"));
70 
71 static cl::opt<int>
72 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
73     cl::desc("Attempt to vectorize for this register size in bits"));
74 
75 /// Limits the size of scheduling regions in a block.
76 /// It avoid long compile times for _very_ large blocks where vector
77 /// instructions are spread over a wide range.
78 /// This limit is way higher than needed by real-world functions.
79 static cl::opt<int>
80 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
81     cl::desc("Limit the size of the SLP scheduling region per block"));
82 
83 static cl::opt<int> MinVectorRegSizeOption(
84     "slp-min-reg-size", cl::init(128), cl::Hidden,
85     cl::desc("Attempt to vectorize for this register size in bits"));
86 
87 static cl::opt<unsigned> RecursionMaxDepth(
88     "slp-recursion-max-depth", cl::init(12), cl::Hidden,
89     cl::desc("Limit the recursion depth when building a vectorizable tree"));
90 
91 static cl::opt<unsigned> MinTreeSize(
92     "slp-min-tree-size", cl::init(3), cl::Hidden,
93     cl::desc("Only vectorize small trees if they are fully vectorizable"));
94 
95 static cl::opt<bool>
96     ViewSLPTree("view-slp-tree", cl::Hidden,
97                 cl::desc("Display the SLP trees with Graphviz"));
98 
99 // Limit the number of alias checks. The limit is chosen so that
100 // it has no negative effect on the llvm benchmarks.
101 static const unsigned AliasedCheckLimit = 10;
102 
103 // Another limit for the alias checks: The maximum distance between load/store
104 // instructions where alias checks are done.
105 // This limit is useful for very large basic blocks.
106 static const unsigned MaxMemDepDistance = 160;
107 
108 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
109 /// regions to be handled.
110 static const int MinScheduleRegionSize = 16;
111 
112 /// \brief Predicate for the element types that the SLP vectorizer supports.
113 ///
114 /// The most important thing to filter here are types which are invalid in LLVM
115 /// vectors. We also filter target specific types which have absolutely no
116 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
117 /// avoids spending time checking the cost model and realizing that they will
118 /// be inevitably scalarized.
119 static bool isValidElementType(Type *Ty) {
120   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
121          !Ty->isPPC_FP128Ty();
122 }
123 
124 /// \returns true if all of the instructions in \p VL are in the same block or
125 /// false otherwise.
126 static bool allSameBlock(ArrayRef<Value *> VL) {
127   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
128   if (!I0)
129     return false;
130   BasicBlock *BB = I0->getParent();
131   for (int i = 1, e = VL.size(); i < e; i++) {
132     Instruction *I = dyn_cast<Instruction>(VL[i]);
133     if (!I)
134       return false;
135 
136     if (BB != I->getParent())
137       return false;
138   }
139   return true;
140 }
141 
142 /// \returns True if all of the values in \p VL are constants.
143 static bool allConstant(ArrayRef<Value *> VL) {
144   for (Value *i : VL)
145     if (!isa<Constant>(i))
146       return false;
147   return true;
148 }
149 
150 /// \returns True if all of the values in \p VL are identical.
151 static bool isSplat(ArrayRef<Value *> VL) {
152   for (unsigned i = 1, e = VL.size(); i < e; ++i)
153     if (VL[i] != VL[0])
154       return false;
155   return true;
156 }
157 
158 ///\returns Opcode that can be clubbed with \p Op to create an alternate
159 /// sequence which can later be merged as a ShuffleVector instruction.
160 static unsigned getAltOpcode(unsigned Op) {
161   switch (Op) {
162   case Instruction::FAdd:
163     return Instruction::FSub;
164   case Instruction::FSub:
165     return Instruction::FAdd;
166   case Instruction::Add:
167     return Instruction::Sub;
168   case Instruction::Sub:
169     return Instruction::Add;
170   default:
171     return 0;
172   }
173 }
174 
175 ///\returns bool representing if Opcode \p Op can be part
176 /// of an alternate sequence which can later be merged as
177 /// a ShuffleVector instruction.
178 static bool canCombineAsAltInst(unsigned Op) {
179   return Op == Instruction::FAdd || Op == Instruction::FSub ||
180          Op == Instruction::Sub || Op == Instruction::Add;
181 }
182 
183 /// \returns ShuffleVector instruction if instructions in \p VL have
184 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
185 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
186 static unsigned isAltInst(ArrayRef<Value *> VL) {
187   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
188   unsigned Opcode = I0->getOpcode();
189   unsigned AltOpcode = getAltOpcode(Opcode);
190   for (int i = 1, e = VL.size(); i < e; i++) {
191     Instruction *I = dyn_cast<Instruction>(VL[i]);
192     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
193       return 0;
194   }
195   return Instruction::ShuffleVector;
196 }
197 
198 /// \returns The opcode if all of the Instructions in \p VL have the same
199 /// opcode, or zero.
200 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
201   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
202   if (!I0)
203     return 0;
204   unsigned Opcode = I0->getOpcode();
205   for (int i = 1, e = VL.size(); i < e; i++) {
206     Instruction *I = dyn_cast<Instruction>(VL[i]);
207     if (!I || Opcode != I->getOpcode()) {
208       if (canCombineAsAltInst(Opcode) && i == 1)
209         return isAltInst(VL);
210       return 0;
211     }
212   }
213   return Opcode;
214 }
215 
216 /// \returns true if all of the values in \p VL have the same type or false
217 /// otherwise.
218 static bool allSameType(ArrayRef<Value *> VL) {
219   Type *Ty = VL[0]->getType();
220   for (int i = 1, e = VL.size(); i < e; i++)
221     if (VL[i]->getType() != Ty)
222       return false;
223 
224   return true;
225 }
226 
227 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
228 static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
229   assert(Opcode == Instruction::ExtractElement ||
230          Opcode == Instruction::ExtractValue);
231   if (Opcode == Instruction::ExtractElement) {
232     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
233     return CI && CI->getZExtValue() == Idx;
234   } else {
235     ExtractValueInst *EI = cast<ExtractValueInst>(E);
236     return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
237   }
238 }
239 
240 /// \returns True if in-tree use also needs extract. This refers to
241 /// possible scalar operand in vectorized instruction.
242 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
243                                     TargetLibraryInfo *TLI) {
244 
245   unsigned Opcode = UserInst->getOpcode();
246   switch (Opcode) {
247   case Instruction::Load: {
248     LoadInst *LI = cast<LoadInst>(UserInst);
249     return (LI->getPointerOperand() == Scalar);
250   }
251   case Instruction::Store: {
252     StoreInst *SI = cast<StoreInst>(UserInst);
253     return (SI->getPointerOperand() == Scalar);
254   }
255   case Instruction::Call: {
256     CallInst *CI = cast<CallInst>(UserInst);
257     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
258     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
259       return (CI->getArgOperand(1) == Scalar);
260     }
261   }
262   default:
263     return false;
264   }
265 }
266 
267 /// \returns the AA location that is being access by the instruction.
268 static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
269   if (StoreInst *SI = dyn_cast<StoreInst>(I))
270     return MemoryLocation::get(SI);
271   if (LoadInst *LI = dyn_cast<LoadInst>(I))
272     return MemoryLocation::get(LI);
273   return MemoryLocation();
274 }
275 
276 /// \returns True if the instruction is not a volatile or atomic load/store.
277 static bool isSimple(Instruction *I) {
278   if (LoadInst *LI = dyn_cast<LoadInst>(I))
279     return LI->isSimple();
280   if (StoreInst *SI = dyn_cast<StoreInst>(I))
281     return SI->isSimple();
282   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
283     return !MI->isVolatile();
284   return true;
285 }
286 
287 namespace llvm {
288 namespace slpvectorizer {
289 /// Bottom Up SLP Vectorizer.
290 class BoUpSLP {
291 public:
292   typedef SmallVector<Value *, 8> ValueList;
293   typedef SmallVector<Instruction *, 16> InstrList;
294   typedef SmallPtrSet<Value *, 16> ValueSet;
295   typedef SmallVector<StoreInst *, 8> StoreList;
296   typedef MapVector<Value *, SmallVector<Instruction *, 2>>
297       ExtraValueToDebugLocsMap;
298 
299   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
300           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
301           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
302           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
303       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
304         SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
305         DL(DL), ORE(ORE), Builder(Se->getContext()) {
306     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
307     // Use the vector register size specified by the target unless overridden
308     // by a command-line option.
309     // TODO: It would be better to limit the vectorization factor based on
310     //       data type rather than just register size. For example, x86 AVX has
311     //       256-bit registers, but it does not support integer operations
312     //       at that width (that requires AVX2).
313     if (MaxVectorRegSizeOption.getNumOccurrences())
314       MaxVecRegSize = MaxVectorRegSizeOption;
315     else
316       MaxVecRegSize = TTI->getRegisterBitWidth(true);
317 
318     MinVecRegSize = MinVectorRegSizeOption;
319   }
320 
321   /// \brief Vectorize the tree that starts with the elements in \p VL.
322   /// Returns the vectorized root.
323   Value *vectorizeTree();
324   /// Vectorize the tree but with the list of externally used values \p
325   /// ExternallyUsedValues. Values in this MapVector can be replaced but the
326   /// generated extractvalue instructions.
327   Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
328 
329   /// \returns the cost incurred by unwanted spills and fills, caused by
330   /// holding live values over call sites.
331   int getSpillCost();
332 
333   /// \returns the vectorization cost of the subtree that starts at \p VL.
334   /// A negative number means that this is profitable.
335   int getTreeCost();
336 
337   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
338   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
339   void buildTree(ArrayRef<Value *> Roots,
340                  ArrayRef<Value *> UserIgnoreLst = None);
341   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
342   /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
343   /// into account (anf updating it, if required) list of externally used
344   /// values stored in \p ExternallyUsedValues.
345   void buildTree(ArrayRef<Value *> Roots,
346                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
347                  ArrayRef<Value *> UserIgnoreLst = None);
348 
349   /// Clear the internal data structures that are created by 'buildTree'.
350   void deleteTree() {
351     VectorizableTree.clear();
352     ScalarToTreeEntry.clear();
353     MustGather.clear();
354     ExternalUses.clear();
355     NumLoadsWantToKeepOrder = 0;
356     NumLoadsWantToChangeOrder = 0;
357     for (auto &Iter : BlocksSchedules) {
358       BlockScheduling *BS = Iter.second.get();
359       BS->clear();
360     }
361     MinBWs.clear();
362   }
363 
364   unsigned getTreeSize() const { return VectorizableTree.size(); }
365 
366   /// \brief Perform LICM and CSE on the newly generated gather sequences.
367   void optimizeGatherSequence();
368 
369   /// \returns true if it is beneficial to reverse the vector order.
370   bool shouldReorder() const {
371     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
372   }
373 
374   /// \return The vector element size in bits to use when vectorizing the
375   /// expression tree ending at \p V. If V is a store, the size is the width of
376   /// the stored value. Otherwise, the size is the width of the largest loaded
377   /// value reaching V. This method is used by the vectorizer to calculate
378   /// vectorization factors.
379   unsigned getVectorElementSize(Value *V);
380 
381   /// Compute the minimum type sizes required to represent the entries in a
382   /// vectorizable tree.
383   void computeMinimumValueSizes();
384 
385   // \returns maximum vector register size as set by TTI or overridden by cl::opt.
386   unsigned getMaxVecRegSize() const {
387     return MaxVecRegSize;
388   }
389 
390   // \returns minimum vector register size as set by cl::opt.
391   unsigned getMinVecRegSize() const {
392     return MinVecRegSize;
393   }
394 
395   /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
396   ///
397   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
398   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
399 
400   /// \returns True if the VectorizableTree is both tiny and not fully
401   /// vectorizable. We do not vectorize such trees.
402   bool isTreeTinyAndNotFullyVectorizable();
403 
404   OptimizationRemarkEmitter *getORE() { return ORE; }
405 
406 private:
407   struct TreeEntry;
408 
409   /// \returns the cost of the vectorizable entry.
410   int getEntryCost(TreeEntry *E);
411 
412   /// This is the recursive part of buildTree.
413   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
414 
415   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
416   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
417   bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
418 
419   /// Vectorize a single entry in the tree.
420   Value *vectorizeTree(TreeEntry *E);
421 
422   /// Vectorize a single entry in the tree, starting in \p VL.
423   Value *vectorizeTree(ArrayRef<Value *> VL);
424 
425   /// \returns the pointer to the vectorized value if \p VL is already
426   /// vectorized, or NULL. They may happen in cycles.
427   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
428 
429   /// \returns the scalarization cost for this type. Scalarization in this
430   /// context means the creation of vectors from a group of scalars.
431   int getGatherCost(Type *Ty);
432 
433   /// \returns the scalarization cost for this list of values. Assuming that
434   /// this subtree gets vectorized, we may need to extract the values from the
435   /// roots. This method calculates the cost of extracting the values.
436   int getGatherCost(ArrayRef<Value *> VL);
437 
438   /// \brief Set the Builder insert point to one after the last instruction in
439   /// the bundle
440   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
441 
442   /// \returns a vector from a collection of scalars in \p VL.
443   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
444 
445   /// \returns whether the VectorizableTree is fully vectorizable and will
446   /// be beneficial even the tree height is tiny.
447   bool isFullyVectorizableTinyTree();
448 
449   /// \reorder commutative operands in alt shuffle if they result in
450   ///  vectorized code.
451   void reorderAltShuffleOperands(ArrayRef<Value *> VL,
452                                  SmallVectorImpl<Value *> &Left,
453                                  SmallVectorImpl<Value *> &Right);
454   /// \reorder commutative operands to get better probability of
455   /// generating vectorized code.
456   void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
457                                       SmallVectorImpl<Value *> &Left,
458                                       SmallVectorImpl<Value *> &Right);
459   struct TreeEntry {
460     TreeEntry(std::vector<TreeEntry> &Container)
461         : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
462           Container(Container) {}
463 
464     /// \returns true if the scalars in VL are equal to this entry.
465     bool isSame(ArrayRef<Value *> VL) const {
466       assert(VL.size() == Scalars.size() && "Invalid size");
467       return std::equal(VL.begin(), VL.end(), Scalars.begin());
468     }
469 
470     /// A vector of scalars.
471     ValueList Scalars;
472 
473     /// The Scalars are vectorized into this value. It is initialized to Null.
474     Value *VectorizedValue;
475 
476     /// Do we need to gather this sequence ?
477     bool NeedToGather;
478 
479     /// Points back to the VectorizableTree.
480     ///
481     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
482     /// to be a pointer and needs to be able to initialize the child iterator.
483     /// Thus we need a reference back to the container to translate the indices
484     /// to entries.
485     std::vector<TreeEntry> &Container;
486 
487     /// The TreeEntry index containing the user of this entry.  We can actually
488     /// have multiple users so the data structure is not truly a tree.
489     SmallVector<int, 1> UserTreeIndices;
490   };
491 
492   /// Create a new VectorizableTree entry.
493   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
494                           int &UserTreeIdx) {
495     VectorizableTree.emplace_back(VectorizableTree);
496     int idx = VectorizableTree.size() - 1;
497     TreeEntry *Last = &VectorizableTree[idx];
498     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
499     Last->NeedToGather = !Vectorized;
500     if (Vectorized) {
501       for (int i = 0, e = VL.size(); i != e; ++i) {
502         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
503         ScalarToTreeEntry[VL[i]] = idx;
504       }
505     } else {
506       MustGather.insert(VL.begin(), VL.end());
507     }
508 
509     if (UserTreeIdx >= 0)
510       Last->UserTreeIndices.push_back(UserTreeIdx);
511     UserTreeIdx = idx;
512     return Last;
513   }
514 
515   /// -- Vectorization State --
516   /// Holds all of the tree entries.
517   std::vector<TreeEntry> VectorizableTree;
518 
519   /// Maps a specific scalar to its tree entry.
520   SmallDenseMap<Value*, int> ScalarToTreeEntry;
521 
522   /// A list of scalars that we found that we need to keep as scalars.
523   ValueSet MustGather;
524 
525   /// This POD struct describes one external user in the vectorized tree.
526   struct ExternalUser {
527     ExternalUser (Value *S, llvm::User *U, int L) :
528       Scalar(S), User(U), Lane(L){}
529     // Which scalar in our function.
530     Value *Scalar;
531     // Which user that uses the scalar.
532     llvm::User *User;
533     // Which lane does the scalar belong to.
534     int Lane;
535   };
536   typedef SmallVector<ExternalUser, 16> UserList;
537 
538   /// Checks if two instructions may access the same memory.
539   ///
540   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
541   /// is invariant in the calling loop.
542   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
543                  Instruction *Inst2) {
544 
545     // First check if the result is already in the cache.
546     AliasCacheKey key = std::make_pair(Inst1, Inst2);
547     Optional<bool> &result = AliasCache[key];
548     if (result.hasValue()) {
549       return result.getValue();
550     }
551     MemoryLocation Loc2 = getLocation(Inst2, AA);
552     bool aliased = true;
553     if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
554       // Do the alias check.
555       aliased = AA->alias(Loc1, Loc2);
556     }
557     // Store the result in the cache.
558     result = aliased;
559     return aliased;
560   }
561 
562   typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
563 
564   /// Cache for alias results.
565   /// TODO: consider moving this to the AliasAnalysis itself.
566   DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
567 
568   /// Removes an instruction from its block and eventually deletes it.
569   /// It's like Instruction::eraseFromParent() except that the actual deletion
570   /// is delayed until BoUpSLP is destructed.
571   /// This is required to ensure that there are no incorrect collisions in the
572   /// AliasCache, which can happen if a new instruction is allocated at the
573   /// same address as a previously deleted instruction.
574   void eraseInstruction(Instruction *I) {
575     I->removeFromParent();
576     I->dropAllReferences();
577     DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
578   }
579 
580   /// Temporary store for deleted instructions. Instructions will be deleted
581   /// eventually when the BoUpSLP is destructed.
582   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
583 
584   /// A list of values that need to extracted out of the tree.
585   /// This list holds pairs of (Internal Scalar : External User). External User
586   /// can be nullptr, it means that this Internal Scalar will be used later,
587   /// after vectorization.
588   UserList ExternalUses;
589 
590   /// Values used only by @llvm.assume calls.
591   SmallPtrSet<const Value *, 32> EphValues;
592 
593   /// Holds all of the instructions that we gathered.
594   SetVector<Instruction *> GatherSeq;
595   /// A list of blocks that we are going to CSE.
596   SetVector<BasicBlock *> CSEBlocks;
597 
598   /// Contains all scheduling relevant data for an instruction.
599   /// A ScheduleData either represents a single instruction or a member of an
600   /// instruction bundle (= a group of instructions which is combined into a
601   /// vector instruction).
602   struct ScheduleData {
603 
604     // The initial value for the dependency counters. It means that the
605     // dependencies are not calculated yet.
606     enum { InvalidDeps = -1 };
607 
608     ScheduleData()
609         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
610           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
611           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
612           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
613 
614     void init(int BlockSchedulingRegionID) {
615       FirstInBundle = this;
616       NextInBundle = nullptr;
617       NextLoadStore = nullptr;
618       IsScheduled = false;
619       SchedulingRegionID = BlockSchedulingRegionID;
620       UnscheduledDepsInBundle = UnscheduledDeps;
621       clearDependencies();
622     }
623 
624     /// Returns true if the dependency information has been calculated.
625     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
626 
627     /// Returns true for single instructions and for bundle representatives
628     /// (= the head of a bundle).
629     bool isSchedulingEntity() const { return FirstInBundle == this; }
630 
631     /// Returns true if it represents an instruction bundle and not only a
632     /// single instruction.
633     bool isPartOfBundle() const {
634       return NextInBundle != nullptr || FirstInBundle != this;
635     }
636 
637     /// Returns true if it is ready for scheduling, i.e. it has no more
638     /// unscheduled depending instructions/bundles.
639     bool isReady() const {
640       assert(isSchedulingEntity() &&
641              "can't consider non-scheduling entity for ready list");
642       return UnscheduledDepsInBundle == 0 && !IsScheduled;
643     }
644 
645     /// Modifies the number of unscheduled dependencies, also updating it for
646     /// the whole bundle.
647     int incrementUnscheduledDeps(int Incr) {
648       UnscheduledDeps += Incr;
649       return FirstInBundle->UnscheduledDepsInBundle += Incr;
650     }
651 
652     /// Sets the number of unscheduled dependencies to the number of
653     /// dependencies.
654     void resetUnscheduledDeps() {
655       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
656     }
657 
658     /// Clears all dependency information.
659     void clearDependencies() {
660       Dependencies = InvalidDeps;
661       resetUnscheduledDeps();
662       MemoryDependencies.clear();
663     }
664 
665     void dump(raw_ostream &os) const {
666       if (!isSchedulingEntity()) {
667         os << "/ " << *Inst;
668       } else if (NextInBundle) {
669         os << '[' << *Inst;
670         ScheduleData *SD = NextInBundle;
671         while (SD) {
672           os << ';' << *SD->Inst;
673           SD = SD->NextInBundle;
674         }
675         os << ']';
676       } else {
677         os << *Inst;
678       }
679     }
680 
681     Instruction *Inst;
682 
683     /// Points to the head in an instruction bundle (and always to this for
684     /// single instructions).
685     ScheduleData *FirstInBundle;
686 
687     /// Single linked list of all instructions in a bundle. Null if it is a
688     /// single instruction.
689     ScheduleData *NextInBundle;
690 
691     /// Single linked list of all memory instructions (e.g. load, store, call)
692     /// in the block - until the end of the scheduling region.
693     ScheduleData *NextLoadStore;
694 
695     /// The dependent memory instructions.
696     /// This list is derived on demand in calculateDependencies().
697     SmallVector<ScheduleData *, 4> MemoryDependencies;
698 
699     /// This ScheduleData is in the current scheduling region if this matches
700     /// the current SchedulingRegionID of BlockScheduling.
701     int SchedulingRegionID;
702 
703     /// Used for getting a "good" final ordering of instructions.
704     int SchedulingPriority;
705 
706     /// The number of dependencies. Constitutes of the number of users of the
707     /// instruction plus the number of dependent memory instructions (if any).
708     /// This value is calculated on demand.
709     /// If InvalidDeps, the number of dependencies is not calculated yet.
710     ///
711     int Dependencies;
712 
713     /// The number of dependencies minus the number of dependencies of scheduled
714     /// instructions. As soon as this is zero, the instruction/bundle gets ready
715     /// for scheduling.
716     /// Note that this is negative as long as Dependencies is not calculated.
717     int UnscheduledDeps;
718 
719     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
720     /// single instructions.
721     int UnscheduledDepsInBundle;
722 
723     /// True if this instruction is scheduled (or considered as scheduled in the
724     /// dry-run).
725     bool IsScheduled;
726   };
727 
728 #ifndef NDEBUG
729   friend inline raw_ostream &operator<<(raw_ostream &os,
730                                         const BoUpSLP::ScheduleData &SD) {
731     SD.dump(os);
732     return os;
733   }
734 #endif
735   friend struct GraphTraits<BoUpSLP *>;
736   friend struct DOTGraphTraits<BoUpSLP *>;
737 
738   /// Contains all scheduling data for a basic block.
739   ///
740   struct BlockScheduling {
741 
742     BlockScheduling(BasicBlock *BB)
743         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
744           ScheduleStart(nullptr), ScheduleEnd(nullptr),
745           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
746           ScheduleRegionSize(0),
747           ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
748           // Make sure that the initial SchedulingRegionID is greater than the
749           // initial SchedulingRegionID in ScheduleData (which is 0).
750           SchedulingRegionID(1) {}
751 
752     void clear() {
753       ReadyInsts.clear();
754       ScheduleStart = nullptr;
755       ScheduleEnd = nullptr;
756       FirstLoadStoreInRegion = nullptr;
757       LastLoadStoreInRegion = nullptr;
758 
759       // Reduce the maximum schedule region size by the size of the
760       // previous scheduling run.
761       ScheduleRegionSizeLimit -= ScheduleRegionSize;
762       if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
763         ScheduleRegionSizeLimit = MinScheduleRegionSize;
764       ScheduleRegionSize = 0;
765 
766       // Make a new scheduling region, i.e. all existing ScheduleData is not
767       // in the new region yet.
768       ++SchedulingRegionID;
769     }
770 
771     ScheduleData *getScheduleData(Value *V) {
772       ScheduleData *SD = ScheduleDataMap[V];
773       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
774         return SD;
775       return nullptr;
776     }
777 
778     bool isInSchedulingRegion(ScheduleData *SD) {
779       return SD->SchedulingRegionID == SchedulingRegionID;
780     }
781 
782     /// Marks an instruction as scheduled and puts all dependent ready
783     /// instructions into the ready-list.
784     template <typename ReadyListType>
785     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
786       SD->IsScheduled = true;
787       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
788 
789       ScheduleData *BundleMember = SD;
790       while (BundleMember) {
791         // Handle the def-use chain dependencies.
792         for (Use &U : BundleMember->Inst->operands()) {
793           ScheduleData *OpDef = getScheduleData(U.get());
794           if (OpDef && OpDef->hasValidDependencies() &&
795               OpDef->incrementUnscheduledDeps(-1) == 0) {
796             // There are no more unscheduled dependencies after decrementing,
797             // so we can put the dependent instruction into the ready list.
798             ScheduleData *DepBundle = OpDef->FirstInBundle;
799             assert(!DepBundle->IsScheduled &&
800                    "already scheduled bundle gets ready");
801             ReadyList.insert(DepBundle);
802             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
803           }
804         }
805         // Handle the memory dependencies.
806         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
807           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
808             // There are no more unscheduled dependencies after decrementing,
809             // so we can put the dependent instruction into the ready list.
810             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
811             assert(!DepBundle->IsScheduled &&
812                    "already scheduled bundle gets ready");
813             ReadyList.insert(DepBundle);
814             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
815           }
816         }
817         BundleMember = BundleMember->NextInBundle;
818       }
819     }
820 
821     /// Put all instructions into the ReadyList which are ready for scheduling.
822     template <typename ReadyListType>
823     void initialFillReadyList(ReadyListType &ReadyList) {
824       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
825         ScheduleData *SD = getScheduleData(I);
826         if (SD->isSchedulingEntity() && SD->isReady()) {
827           ReadyList.insert(SD);
828           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
829         }
830       }
831     }
832 
833     /// Checks if a bundle of instructions can be scheduled, i.e. has no
834     /// cyclic dependencies. This is only a dry-run, no instructions are
835     /// actually moved at this stage.
836     bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
837 
838     /// Un-bundles a group of instructions.
839     void cancelScheduling(ArrayRef<Value *> VL);
840 
841     /// Extends the scheduling region so that V is inside the region.
842     /// \returns true if the region size is within the limit.
843     bool extendSchedulingRegion(Value *V);
844 
845     /// Initialize the ScheduleData structures for new instructions in the
846     /// scheduling region.
847     void initScheduleData(Instruction *FromI, Instruction *ToI,
848                           ScheduleData *PrevLoadStore,
849                           ScheduleData *NextLoadStore);
850 
851     /// Updates the dependency information of a bundle and of all instructions/
852     /// bundles which depend on the original bundle.
853     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
854                                BoUpSLP *SLP);
855 
856     /// Sets all instruction in the scheduling region to un-scheduled.
857     void resetSchedule();
858 
859     BasicBlock *BB;
860 
861     /// Simple memory allocation for ScheduleData.
862     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
863 
864     /// The size of a ScheduleData array in ScheduleDataChunks.
865     int ChunkSize;
866 
867     /// The allocator position in the current chunk, which is the last entry
868     /// of ScheduleDataChunks.
869     int ChunkPos;
870 
871     /// Attaches ScheduleData to Instruction.
872     /// Note that the mapping survives during all vectorization iterations, i.e.
873     /// ScheduleData structures are recycled.
874     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
875 
876     struct ReadyList : SmallVector<ScheduleData *, 8> {
877       void insert(ScheduleData *SD) { push_back(SD); }
878     };
879 
880     /// The ready-list for scheduling (only used for the dry-run).
881     ReadyList ReadyInsts;
882 
883     /// The first instruction of the scheduling region.
884     Instruction *ScheduleStart;
885 
886     /// The first instruction _after_ the scheduling region.
887     Instruction *ScheduleEnd;
888 
889     /// The first memory accessing instruction in the scheduling region
890     /// (can be null).
891     ScheduleData *FirstLoadStoreInRegion;
892 
893     /// The last memory accessing instruction in the scheduling region
894     /// (can be null).
895     ScheduleData *LastLoadStoreInRegion;
896 
897     /// The current size of the scheduling region.
898     int ScheduleRegionSize;
899 
900     /// The maximum size allowed for the scheduling region.
901     int ScheduleRegionSizeLimit;
902 
903     /// The ID of the scheduling region. For a new vectorization iteration this
904     /// is incremented which "removes" all ScheduleData from the region.
905     int SchedulingRegionID;
906   };
907 
908   /// Attaches the BlockScheduling structures to basic blocks.
909   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
910 
911   /// Performs the "real" scheduling. Done before vectorization is actually
912   /// performed in a basic block.
913   void scheduleBlock(BlockScheduling *BS);
914 
915   /// List of users to ignore during scheduling and that don't need extracting.
916   ArrayRef<Value *> UserIgnoreList;
917 
918   // Number of load bundles that contain consecutive loads.
919   int NumLoadsWantToKeepOrder;
920 
921   // Number of load bundles that contain consecutive loads in reversed order.
922   int NumLoadsWantToChangeOrder;
923 
924   // Analysis and block reference.
925   Function *F;
926   ScalarEvolution *SE;
927   TargetTransformInfo *TTI;
928   TargetLibraryInfo *TLI;
929   AliasAnalysis *AA;
930   LoopInfo *LI;
931   DominatorTree *DT;
932   AssumptionCache *AC;
933   DemandedBits *DB;
934   const DataLayout *DL;
935   OptimizationRemarkEmitter *ORE;
936 
937   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
938   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
939   /// Instruction builder to construct the vectorized tree.
940   IRBuilder<> Builder;
941 
942   /// A map of scalar integer values to the smallest bit width with which they
943   /// can legally be represented. The values map to (width, signed) pairs,
944   /// where "width" indicates the minimum bit width and "signed" is True if the
945   /// value must be signed-extended, rather than zero-extended, back to its
946   /// original width.
947   MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
948 };
949 } // end namespace slpvectorizer
950 
951 template <> struct GraphTraits<BoUpSLP *> {
952   typedef BoUpSLP::TreeEntry TreeEntry;
953 
954   /// NodeRef has to be a pointer per the GraphWriter.
955   typedef TreeEntry *NodeRef;
956 
957   /// \brief Add the VectorizableTree to the index iterator to be able to return
958   /// TreeEntry pointers.
959   struct ChildIteratorType
960       : public iterator_adaptor_base<ChildIteratorType,
961                                      SmallVector<int, 1>::iterator> {
962 
963     std::vector<TreeEntry> &VectorizableTree;
964 
965     ChildIteratorType(SmallVector<int, 1>::iterator W,
966                       std::vector<TreeEntry> &VT)
967         : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
968 
969     NodeRef operator*() { return &VectorizableTree[*I]; }
970   };
971 
972   static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
973 
974   static ChildIteratorType child_begin(NodeRef N) {
975     return {N->UserTreeIndices.begin(), N->Container};
976   }
977   static ChildIteratorType child_end(NodeRef N) {
978     return {N->UserTreeIndices.end(), N->Container};
979   }
980 
981   /// For the node iterator we just need to turn the TreeEntry iterator into a
982   /// TreeEntry* iterator so that it dereferences to NodeRef.
983   typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
984 
985   static nodes_iterator nodes_begin(BoUpSLP *R) {
986     return nodes_iterator(R->VectorizableTree.begin());
987   }
988   static nodes_iterator nodes_end(BoUpSLP *R) {
989     return nodes_iterator(R->VectorizableTree.end());
990   }
991 
992   static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
993 };
994 
995 template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
996   typedef BoUpSLP::TreeEntry TreeEntry;
997 
998   DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
999 
1000   std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
1001     std::string Str;
1002     raw_string_ostream OS(Str);
1003     if (isSplat(Entry->Scalars)) {
1004       OS << "<splat> " << *Entry->Scalars[0];
1005       return Str;
1006     }
1007     for (auto V : Entry->Scalars) {
1008       OS << *V;
1009       if (std::any_of(
1010               R->ExternalUses.begin(), R->ExternalUses.end(),
1011               [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
1012         OS << " <extract>";
1013       OS << "\n";
1014     }
1015     return Str;
1016   }
1017 
1018   static std::string getNodeAttributes(const TreeEntry *Entry,
1019                                        const BoUpSLP *) {
1020     if (Entry->NeedToGather)
1021       return "color=red";
1022     return "";
1023   }
1024 };
1025 
1026 } // end namespace llvm
1027 
1028 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1029                         ArrayRef<Value *> UserIgnoreLst) {
1030   ExtraValueToDebugLocsMap ExternallyUsedValues;
1031   buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
1032 }
1033 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
1034                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
1035                         ArrayRef<Value *> UserIgnoreLst) {
1036   deleteTree();
1037   UserIgnoreList = UserIgnoreLst;
1038   if (!allSameType(Roots))
1039     return;
1040   buildTree_rec(Roots, 0, -1);
1041 
1042   // Collect the values that we need to extract from the tree.
1043   for (TreeEntry &EIdx : VectorizableTree) {
1044     TreeEntry *Entry = &EIdx;
1045 
1046     // For each lane:
1047     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
1048       Value *Scalar = Entry->Scalars[Lane];
1049 
1050       // No need to handle users of gathered values.
1051       if (Entry->NeedToGather)
1052         continue;
1053 
1054       // Check if the scalar is externally used as an extra arg.
1055       auto ExtI = ExternallyUsedValues.find(Scalar);
1056       if (ExtI != ExternallyUsedValues.end()) {
1057         DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
1058               Lane << " from " << *Scalar << ".\n");
1059         ExternalUses.emplace_back(Scalar, nullptr, Lane);
1060         continue;
1061       }
1062       for (User *U : Scalar->users()) {
1063         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
1064 
1065         Instruction *UserInst = dyn_cast<Instruction>(U);
1066         if (!UserInst)
1067           continue;
1068 
1069         // Skip in-tree scalars that become vectors
1070         if (ScalarToTreeEntry.count(U)) {
1071           int Idx = ScalarToTreeEntry[U];
1072           TreeEntry *UseEntry = &VectorizableTree[Idx];
1073           Value *UseScalar = UseEntry->Scalars[0];
1074           // Some in-tree scalars will remain as scalar in vectorized
1075           // instructions. If that is the case, the one in Lane 0 will
1076           // be used.
1077           if (UseScalar != U ||
1078               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
1079             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
1080                          << ".\n");
1081             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
1082             continue;
1083           }
1084         }
1085 
1086         // Ignore users in the user ignore list.
1087         if (is_contained(UserIgnoreList, UserInst))
1088           continue;
1089 
1090         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1091               Lane << " from " << *Scalar << ".\n");
1092         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
1093       }
1094     }
1095   }
1096 }
1097 
1098 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
1099                             int UserTreeIdx) {
1100   bool isAltShuffle = false;
1101   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
1102 
1103   if (Depth == RecursionMaxDepth) {
1104     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1105     newTreeEntry(VL, false, UserTreeIdx);
1106     return;
1107   }
1108 
1109   // Don't handle vectors.
1110   if (VL[0]->getType()->isVectorTy()) {
1111     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1112     newTreeEntry(VL, false, UserTreeIdx);
1113     return;
1114   }
1115 
1116   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1117     if (SI->getValueOperand()->getType()->isVectorTy()) {
1118       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1119       newTreeEntry(VL, false, UserTreeIdx);
1120       return;
1121     }
1122   unsigned Opcode = getSameOpcode(VL);
1123 
1124   // Check that this shuffle vector refers to the alternate
1125   // sequence of opcodes.
1126   if (Opcode == Instruction::ShuffleVector) {
1127     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1128     unsigned Op = I0->getOpcode();
1129     if (Op != Instruction::ShuffleVector)
1130       isAltShuffle = true;
1131   }
1132 
1133   // If all of the operands are identical or constant we have a simple solution.
1134   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
1135     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1136     newTreeEntry(VL, false, UserTreeIdx);
1137     return;
1138   }
1139 
1140   // We now know that this is a vector of instructions of the same type from
1141   // the same block.
1142 
1143   // Don't vectorize ephemeral values.
1144   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1145     if (EphValues.count(VL[i])) {
1146       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1147             ") is ephemeral.\n");
1148       newTreeEntry(VL, false, UserTreeIdx);
1149       return;
1150     }
1151   }
1152 
1153   // Check if this is a duplicate of another entry.
1154   if (ScalarToTreeEntry.count(VL[0])) {
1155     int Idx = ScalarToTreeEntry[VL[0]];
1156     TreeEntry *E = &VectorizableTree[Idx];
1157     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1158       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1159       if (E->Scalars[i] != VL[i]) {
1160         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1161         newTreeEntry(VL, false, UserTreeIdx);
1162         return;
1163       }
1164     }
1165     // Record the reuse of the tree node.  FIXME, currently this is only used to
1166     // properly draw the graph rather than for the actual vectorization.
1167     E->UserTreeIndices.push_back(UserTreeIdx);
1168     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1169     return;
1170   }
1171 
1172   // Check that none of the instructions in the bundle are already in the tree.
1173   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1174     if (ScalarToTreeEntry.count(VL[i])) {
1175       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1176             ") is already in tree.\n");
1177       newTreeEntry(VL, false, UserTreeIdx);
1178       return;
1179     }
1180   }
1181 
1182   // If any of the scalars is marked as a value that needs to stay scalar then
1183   // we need to gather the scalars.
1184   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1185     if (MustGather.count(VL[i])) {
1186       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1187       newTreeEntry(VL, false, UserTreeIdx);
1188       return;
1189     }
1190   }
1191 
1192   // Check that all of the users of the scalars that we want to vectorize are
1193   // schedulable.
1194   Instruction *VL0 = cast<Instruction>(VL[0]);
1195   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1196 
1197   if (!DT->isReachableFromEntry(BB)) {
1198     // Don't go into unreachable blocks. They may contain instructions with
1199     // dependency cycles which confuse the final scheduling.
1200     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1201     newTreeEntry(VL, false, UserTreeIdx);
1202     return;
1203   }
1204 
1205   // Check that every instructions appears once in this bundle.
1206   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1207     for (unsigned j = i+1; j < e; ++j)
1208       if (VL[i] == VL[j]) {
1209         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1210         newTreeEntry(VL, false, UserTreeIdx);
1211         return;
1212       }
1213 
1214   auto &BSRef = BlocksSchedules[BB];
1215   if (!BSRef) {
1216     BSRef = llvm::make_unique<BlockScheduling>(BB);
1217   }
1218   BlockScheduling &BS = *BSRef.get();
1219 
1220   if (!BS.tryScheduleBundle(VL, this)) {
1221     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1222     assert((!BS.getScheduleData(VL[0]) ||
1223             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
1224            "tryScheduleBundle should cancelScheduling on failure");
1225     newTreeEntry(VL, false, UserTreeIdx);
1226     return;
1227   }
1228   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1229 
1230   switch (Opcode) {
1231     case Instruction::PHI: {
1232       PHINode *PH = dyn_cast<PHINode>(VL0);
1233 
1234       // Check for terminator values (e.g. invoke).
1235       for (unsigned j = 0; j < VL.size(); ++j)
1236         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1237           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1238               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1239           if (Term) {
1240             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1241             BS.cancelScheduling(VL);
1242             newTreeEntry(VL, false, UserTreeIdx);
1243             return;
1244           }
1245         }
1246 
1247       newTreeEntry(VL, true, UserTreeIdx);
1248       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1249 
1250       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1251         ValueList Operands;
1252         // Prepare the operand vector.
1253         for (Value *j : VL)
1254           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
1255               PH->getIncomingBlock(i)));
1256 
1257         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1258       }
1259       return;
1260     }
1261     case Instruction::ExtractValue:
1262     case Instruction::ExtractElement: {
1263       bool Reuse = canReuseExtract(VL, Opcode);
1264       if (Reuse) {
1265         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1266       } else {
1267         BS.cancelScheduling(VL);
1268       }
1269       newTreeEntry(VL, Reuse, UserTreeIdx);
1270       return;
1271     }
1272     case Instruction::Load: {
1273       // Check that a vectorized load would load the same memory as a scalar
1274       // load.
1275       // For example we don't want vectorize loads that are smaller than 8 bit.
1276       // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
1277       // loading/storing it as an i8 struct. If we vectorize loads/stores from
1278       // such a struct we read/write packed bits disagreeing with the
1279       // unvectorized version.
1280       Type *ScalarTy = VL[0]->getType();
1281 
1282       if (DL->getTypeSizeInBits(ScalarTy) !=
1283           DL->getTypeAllocSizeInBits(ScalarTy)) {
1284         BS.cancelScheduling(VL);
1285         newTreeEntry(VL, false, UserTreeIdx);
1286         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1287         return;
1288       }
1289 
1290       // Make sure all loads in the bundle are simple - we can't vectorize
1291       // atomic or volatile loads.
1292       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1293         LoadInst *L = cast<LoadInst>(VL[i]);
1294         if (!L->isSimple()) {
1295           BS.cancelScheduling(VL);
1296           newTreeEntry(VL, false, UserTreeIdx);
1297           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1298           return;
1299         }
1300       }
1301 
1302       // Check if the loads are consecutive, reversed, or neither.
1303       // TODO: What we really want is to sort the loads, but for now, check
1304       // the two likely directions.
1305       bool Consecutive = true;
1306       bool ReverseConsecutive = true;
1307       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1308         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1309           Consecutive = false;
1310           break;
1311         } else {
1312           ReverseConsecutive = false;
1313         }
1314       }
1315 
1316       if (Consecutive) {
1317         ++NumLoadsWantToKeepOrder;
1318         newTreeEntry(VL, true, UserTreeIdx);
1319         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1320         return;
1321       }
1322 
1323       // If none of the load pairs were consecutive when checked in order,
1324       // check the reverse order.
1325       if (ReverseConsecutive)
1326         for (unsigned i = VL.size() - 1; i > 0; --i)
1327           if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
1328             ReverseConsecutive = false;
1329             break;
1330           }
1331 
1332       BS.cancelScheduling(VL);
1333       newTreeEntry(VL, false, UserTreeIdx);
1334 
1335       if (ReverseConsecutive) {
1336         ++NumLoadsWantToChangeOrder;
1337         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
1338       } else {
1339         DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1340       }
1341       return;
1342     }
1343     case Instruction::ZExt:
1344     case Instruction::SExt:
1345     case Instruction::FPToUI:
1346     case Instruction::FPToSI:
1347     case Instruction::FPExt:
1348     case Instruction::PtrToInt:
1349     case Instruction::IntToPtr:
1350     case Instruction::SIToFP:
1351     case Instruction::UIToFP:
1352     case Instruction::Trunc:
1353     case Instruction::FPTrunc:
1354     case Instruction::BitCast: {
1355       Type *SrcTy = VL0->getOperand(0)->getType();
1356       for (unsigned i = 0; i < VL.size(); ++i) {
1357         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1358         if (Ty != SrcTy || !isValidElementType(Ty)) {
1359           BS.cancelScheduling(VL);
1360           newTreeEntry(VL, false, UserTreeIdx);
1361           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1362           return;
1363         }
1364       }
1365       newTreeEntry(VL, true, UserTreeIdx);
1366       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1367 
1368       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1369         ValueList Operands;
1370         // Prepare the operand vector.
1371         for (Value *j : VL)
1372           Operands.push_back(cast<Instruction>(j)->getOperand(i));
1373 
1374         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1375       }
1376       return;
1377     }
1378     case Instruction::ICmp:
1379     case Instruction::FCmp: {
1380       // Check that all of the compares have the same predicate.
1381       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1382       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1383       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1384         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1385         if (Cmp->getPredicate() != P0 ||
1386             Cmp->getOperand(0)->getType() != ComparedTy) {
1387           BS.cancelScheduling(VL);
1388           newTreeEntry(VL, false, UserTreeIdx);
1389           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1390           return;
1391         }
1392       }
1393 
1394       newTreeEntry(VL, true, UserTreeIdx);
1395       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1396 
1397       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1398         ValueList Operands;
1399         // Prepare the operand vector.
1400         for (Value *j : VL)
1401           Operands.push_back(cast<Instruction>(j)->getOperand(i));
1402 
1403         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1404       }
1405       return;
1406     }
1407     case Instruction::Select:
1408     case Instruction::Add:
1409     case Instruction::FAdd:
1410     case Instruction::Sub:
1411     case Instruction::FSub:
1412     case Instruction::Mul:
1413     case Instruction::FMul:
1414     case Instruction::UDiv:
1415     case Instruction::SDiv:
1416     case Instruction::FDiv:
1417     case Instruction::URem:
1418     case Instruction::SRem:
1419     case Instruction::FRem:
1420     case Instruction::Shl:
1421     case Instruction::LShr:
1422     case Instruction::AShr:
1423     case Instruction::And:
1424     case Instruction::Or:
1425     case Instruction::Xor: {
1426       newTreeEntry(VL, true, UserTreeIdx);
1427       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1428 
1429       // Sort operands of the instructions so that each side is more likely to
1430       // have the same opcode.
1431       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1432         ValueList Left, Right;
1433         reorderInputsAccordingToOpcode(VL, Left, Right);
1434         buildTree_rec(Left, Depth + 1, UserTreeIdx);
1435         buildTree_rec(Right, Depth + 1, UserTreeIdx);
1436         return;
1437       }
1438 
1439       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1440         ValueList Operands;
1441         // Prepare the operand vector.
1442         for (Value *j : VL)
1443           Operands.push_back(cast<Instruction>(j)->getOperand(i));
1444 
1445         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1446       }
1447       return;
1448     }
1449     case Instruction::GetElementPtr: {
1450       // We don't combine GEPs with complicated (nested) indexing.
1451       for (unsigned j = 0; j < VL.size(); ++j) {
1452         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1453           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1454           BS.cancelScheduling(VL);
1455           newTreeEntry(VL, false, UserTreeIdx);
1456           return;
1457         }
1458       }
1459 
1460       // We can't combine several GEPs into one vector if they operate on
1461       // different types.
1462       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1463       for (unsigned j = 0; j < VL.size(); ++j) {
1464         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1465         if (Ty0 != CurTy) {
1466           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1467           BS.cancelScheduling(VL);
1468           newTreeEntry(VL, false, UserTreeIdx);
1469           return;
1470         }
1471       }
1472 
1473       // We don't combine GEPs with non-constant indexes.
1474       for (unsigned j = 0; j < VL.size(); ++j) {
1475         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1476         if (!isa<ConstantInt>(Op)) {
1477           DEBUG(
1478               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1479           BS.cancelScheduling(VL);
1480           newTreeEntry(VL, false, UserTreeIdx);
1481           return;
1482         }
1483       }
1484 
1485       newTreeEntry(VL, true, UserTreeIdx);
1486       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1487       for (unsigned i = 0, e = 2; i < e; ++i) {
1488         ValueList Operands;
1489         // Prepare the operand vector.
1490         for (Value *j : VL)
1491           Operands.push_back(cast<Instruction>(j)->getOperand(i));
1492 
1493         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1494       }
1495       return;
1496     }
1497     case Instruction::Store: {
1498       // Check if the stores are consecutive or of we need to swizzle them.
1499       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1500         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1501           BS.cancelScheduling(VL);
1502           newTreeEntry(VL, false, UserTreeIdx);
1503           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1504           return;
1505         }
1506 
1507       newTreeEntry(VL, true, UserTreeIdx);
1508       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1509 
1510       ValueList Operands;
1511       for (Value *j : VL)
1512         Operands.push_back(cast<Instruction>(j)->getOperand(0));
1513 
1514       buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1515       return;
1516     }
1517     case Instruction::Call: {
1518       // Check if the calls are all to the same vectorizable intrinsic.
1519       CallInst *CI = cast<CallInst>(VL[0]);
1520       // Check if this is an Intrinsic call or something that can be
1521       // represented by an intrinsic call
1522       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1523       if (!isTriviallyVectorizable(ID)) {
1524         BS.cancelScheduling(VL);
1525         newTreeEntry(VL, false, UserTreeIdx);
1526         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1527         return;
1528       }
1529       Function *Int = CI->getCalledFunction();
1530       Value *A1I = nullptr;
1531       if (hasVectorInstrinsicScalarOpd(ID, 1))
1532         A1I = CI->getArgOperand(1);
1533       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1534         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1535         if (!CI2 || CI2->getCalledFunction() != Int ||
1536             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
1537             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
1538           BS.cancelScheduling(VL);
1539           newTreeEntry(VL, false, UserTreeIdx);
1540           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1541                        << "\n");
1542           return;
1543         }
1544         // ctlz,cttz and powi are special intrinsics whose second argument
1545         // should be same in order for them to be vectorized.
1546         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1547           Value *A1J = CI2->getArgOperand(1);
1548           if (A1I != A1J) {
1549             BS.cancelScheduling(VL);
1550             newTreeEntry(VL, false, UserTreeIdx);
1551             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1552                          << " argument "<< A1I<<"!=" << A1J
1553                          << "\n");
1554             return;
1555           }
1556         }
1557         // Verify that the bundle operands are identical between the two calls.
1558         if (CI->hasOperandBundles() &&
1559             !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
1560                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
1561                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
1562           BS.cancelScheduling(VL);
1563           newTreeEntry(VL, false, UserTreeIdx);
1564           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
1565                        << *VL[i] << '\n');
1566           return;
1567         }
1568       }
1569 
1570       newTreeEntry(VL, true, UserTreeIdx);
1571       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1572         ValueList Operands;
1573         // Prepare the operand vector.
1574         for (Value *j : VL) {
1575           CallInst *CI2 = dyn_cast<CallInst>(j);
1576           Operands.push_back(CI2->getArgOperand(i));
1577         }
1578         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1579       }
1580       return;
1581     }
1582     case Instruction::ShuffleVector: {
1583       // If this is not an alternate sequence of opcode like add-sub
1584       // then do not vectorize this instruction.
1585       if (!isAltShuffle) {
1586         BS.cancelScheduling(VL);
1587         newTreeEntry(VL, false, UserTreeIdx);
1588         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1589         return;
1590       }
1591       newTreeEntry(VL, true, UserTreeIdx);
1592       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1593 
1594       // Reorder operands if reordering would enable vectorization.
1595       if (isa<BinaryOperator>(VL0)) {
1596         ValueList Left, Right;
1597         reorderAltShuffleOperands(VL, Left, Right);
1598         buildTree_rec(Left, Depth + 1, UserTreeIdx);
1599         buildTree_rec(Right, Depth + 1, UserTreeIdx);
1600         return;
1601       }
1602 
1603       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1604         ValueList Operands;
1605         // Prepare the operand vector.
1606         for (Value *j : VL)
1607           Operands.push_back(cast<Instruction>(j)->getOperand(i));
1608 
1609         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
1610       }
1611       return;
1612     }
1613     default:
1614       BS.cancelScheduling(VL);
1615       newTreeEntry(VL, false, UserTreeIdx);
1616       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1617       return;
1618   }
1619 }
1620 
1621 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
1622   unsigned N;
1623   Type *EltTy;
1624   auto *ST = dyn_cast<StructType>(T);
1625   if (ST) {
1626     N = ST->getNumElements();
1627     EltTy = *ST->element_begin();
1628   } else {
1629     N = cast<ArrayType>(T)->getNumElements();
1630     EltTy = cast<ArrayType>(T)->getElementType();
1631   }
1632   if (!isValidElementType(EltTy))
1633     return 0;
1634   uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
1635   if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
1636     return 0;
1637   if (ST) {
1638     // Check that struct is homogeneous.
1639     for (const auto *Ty : ST->elements())
1640       if (Ty != EltTy)
1641         return 0;
1642   }
1643   return N;
1644 }
1645 
1646 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
1647   assert(Opcode == Instruction::ExtractElement ||
1648          Opcode == Instruction::ExtractValue);
1649   assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
1650   // Check if all of the extracts come from the same vector and from the
1651   // correct offset.
1652   Value *VL0 = VL[0];
1653   Instruction *E0 = cast<Instruction>(VL0);
1654   Value *Vec = E0->getOperand(0);
1655 
1656   // We have to extract from a vector/aggregate with the same number of elements.
1657   unsigned NElts;
1658   if (Opcode == Instruction::ExtractValue) {
1659     const DataLayout &DL = E0->getModule()->getDataLayout();
1660     NElts = canMapToVector(Vec->getType(), DL);
1661     if (!NElts)
1662       return false;
1663     // Check if load can be rewritten as load of vector.
1664     LoadInst *LI = dyn_cast<LoadInst>(Vec);
1665     if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
1666       return false;
1667   } else {
1668     NElts = Vec->getType()->getVectorNumElements();
1669   }
1670 
1671   if (NElts != VL.size())
1672     return false;
1673 
1674   // Check that all of the indices extract from the correct offset.
1675   if (!matchExtractIndex(E0, 0, Opcode))
1676     return false;
1677 
1678   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1679     Instruction *E = cast<Instruction>(VL[i]);
1680     if (!matchExtractIndex(E, i, Opcode))
1681       return false;
1682     if (E->getOperand(0) != Vec)
1683       return false;
1684   }
1685 
1686   return true;
1687 }
1688 
1689 int BoUpSLP::getEntryCost(TreeEntry *E) {
1690   ArrayRef<Value*> VL = E->Scalars;
1691 
1692   Type *ScalarTy = VL[0]->getType();
1693   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1694     ScalarTy = SI->getValueOperand()->getType();
1695   else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
1696     ScalarTy = CI->getOperand(0)->getType();
1697   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1698 
1699   // If we have computed a smaller type for the expression, update VecTy so
1700   // that the costs will be accurate.
1701   if (MinBWs.count(VL[0]))
1702     VecTy = VectorType::get(
1703         IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
1704 
1705   if (E->NeedToGather) {
1706     if (allConstant(VL))
1707       return 0;
1708     if (isSplat(VL)) {
1709       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1710     }
1711     return getGatherCost(E->Scalars);
1712   }
1713   unsigned Opcode = getSameOpcode(VL);
1714   assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
1715   Instruction *VL0 = cast<Instruction>(VL[0]);
1716   switch (Opcode) {
1717     case Instruction::PHI: {
1718       return 0;
1719     }
1720     case Instruction::ExtractValue:
1721     case Instruction::ExtractElement: {
1722       if (canReuseExtract(VL, Opcode)) {
1723         int DeadCost = 0;
1724         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1725           Instruction *E = cast<Instruction>(VL[i]);
1726           // If all users are going to be vectorized, instruction can be
1727           // considered as dead.
1728           // The same, if have only one user, it will be vectorized for sure.
1729           if (E->hasOneUse() ||
1730               std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
1731                 return ScalarToTreeEntry.count(U) > 0;
1732               }))
1733             // Take credit for instruction that will become dead.
1734             DeadCost +=
1735                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1736         }
1737         return -DeadCost;
1738       }
1739       return getGatherCost(VecTy);
1740     }
1741     case Instruction::ZExt:
1742     case Instruction::SExt:
1743     case Instruction::FPToUI:
1744     case Instruction::FPToSI:
1745     case Instruction::FPExt:
1746     case Instruction::PtrToInt:
1747     case Instruction::IntToPtr:
1748     case Instruction::SIToFP:
1749     case Instruction::UIToFP:
1750     case Instruction::Trunc:
1751     case Instruction::FPTrunc:
1752     case Instruction::BitCast: {
1753       Type *SrcTy = VL0->getOperand(0)->getType();
1754 
1755       // Calculate the cost of this instruction.
1756       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1757                                                          VL0->getType(), SrcTy, VL0);
1758 
1759       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1760       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
1761       return VecCost - ScalarCost;
1762     }
1763     case Instruction::FCmp:
1764     case Instruction::ICmp:
1765     case Instruction::Select: {
1766       // Calculate the cost of this instruction.
1767       VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1768       int ScalarCost = VecTy->getNumElements() *
1769           TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
1770       int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
1771       return VecCost - ScalarCost;
1772     }
1773     case Instruction::Add:
1774     case Instruction::FAdd:
1775     case Instruction::Sub:
1776     case Instruction::FSub:
1777     case Instruction::Mul:
1778     case Instruction::FMul:
1779     case Instruction::UDiv:
1780     case Instruction::SDiv:
1781     case Instruction::FDiv:
1782     case Instruction::URem:
1783     case Instruction::SRem:
1784     case Instruction::FRem:
1785     case Instruction::Shl:
1786     case Instruction::LShr:
1787     case Instruction::AShr:
1788     case Instruction::And:
1789     case Instruction::Or:
1790     case Instruction::Xor: {
1791       // Certain instructions can be cheaper to vectorize if they have a
1792       // constant second vector operand.
1793       TargetTransformInfo::OperandValueKind Op1VK =
1794           TargetTransformInfo::OK_AnyValue;
1795       TargetTransformInfo::OperandValueKind Op2VK =
1796           TargetTransformInfo::OK_UniformConstantValue;
1797       TargetTransformInfo::OperandValueProperties Op1VP =
1798           TargetTransformInfo::OP_None;
1799       TargetTransformInfo::OperandValueProperties Op2VP =
1800           TargetTransformInfo::OP_None;
1801 
1802       // If all operands are exactly the same ConstantInt then set the
1803       // operand kind to OK_UniformConstantValue.
1804       // If instead not all operands are constants, then set the operand kind
1805       // to OK_AnyValue. If all operands are constants but not the same,
1806       // then set the operand kind to OK_NonUniformConstantValue.
1807       ConstantInt *CInt = nullptr;
1808       for (unsigned i = 0; i < VL.size(); ++i) {
1809         const Instruction *I = cast<Instruction>(VL[i]);
1810         if (!isa<ConstantInt>(I->getOperand(1))) {
1811           Op2VK = TargetTransformInfo::OK_AnyValue;
1812           break;
1813         }
1814         if (i == 0) {
1815           CInt = cast<ConstantInt>(I->getOperand(1));
1816           continue;
1817         }
1818         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1819             CInt != cast<ConstantInt>(I->getOperand(1)))
1820           Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1821       }
1822       // FIXME: Currently cost of model modification for division by power of
1823       // 2 is handled for X86 and AArch64. Add support for other targets.
1824       if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1825           CInt->getValue().isPowerOf2())
1826         Op2VP = TargetTransformInfo::OP_PowerOf2;
1827 
1828       SmallVector<const Value *, 4> Operands(VL0->operand_values());
1829       int ScalarCost =
1830           VecTy->getNumElements() *
1831           TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
1832                                       Op2VP, Operands);
1833       int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1834                                                 Op1VP, Op2VP, Operands);
1835       return VecCost - ScalarCost;
1836     }
1837     case Instruction::GetElementPtr: {
1838       TargetTransformInfo::OperandValueKind Op1VK =
1839           TargetTransformInfo::OK_AnyValue;
1840       TargetTransformInfo::OperandValueKind Op2VK =
1841           TargetTransformInfo::OK_UniformConstantValue;
1842 
1843       int ScalarCost =
1844           VecTy->getNumElements() *
1845           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1846       int VecCost =
1847           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1848 
1849       return VecCost - ScalarCost;
1850     }
1851     case Instruction::Load: {
1852       // Cost of wide load - cost of scalar loads.
1853       unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
1854       int ScalarLdCost = VecTy->getNumElements() *
1855           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
1856       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
1857                                            VecTy, alignment, 0, VL0);
1858       return VecLdCost - ScalarLdCost;
1859     }
1860     case Instruction::Store: {
1861       // We know that we can merge the stores. Calculate the cost.
1862       unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
1863       int ScalarStCost = VecTy->getNumElements() *
1864           TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
1865       int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
1866                                            VecTy, alignment, 0, VL0);
1867       return VecStCost - ScalarStCost;
1868     }
1869     case Instruction::Call: {
1870       CallInst *CI = cast<CallInst>(VL0);
1871       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1872 
1873       // Calculate the cost of the scalar and vector calls.
1874       SmallVector<Type*, 4> ScalarTys;
1875       for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
1876         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1877 
1878       FastMathFlags FMF;
1879       if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
1880         FMF = FPMO->getFastMathFlags();
1881 
1882       int ScalarCallCost = VecTy->getNumElements() *
1883           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
1884 
1885       SmallVector<Value *, 4> Args(CI->arg_operands());
1886       int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
1887                                                    VecTy->getNumElements());
1888 
1889       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1890             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1891             << " for " << *CI << "\n");
1892 
1893       return VecCallCost - ScalarCallCost;
1894     }
1895     case Instruction::ShuffleVector: {
1896       TargetTransformInfo::OperandValueKind Op1VK =
1897           TargetTransformInfo::OK_AnyValue;
1898       TargetTransformInfo::OperandValueKind Op2VK =
1899           TargetTransformInfo::OK_AnyValue;
1900       int ScalarCost = 0;
1901       int VecCost = 0;
1902       for (Value *i : VL) {
1903         Instruction *I = cast<Instruction>(i);
1904         if (!I)
1905           break;
1906         ScalarCost +=
1907             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1908       }
1909       // VecCost is equal to sum of the cost of creating 2 vectors
1910       // and the cost of creating shuffle.
1911       Instruction *I0 = cast<Instruction>(VL[0]);
1912       VecCost =
1913           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1914       Instruction *I1 = cast<Instruction>(VL[1]);
1915       VecCost +=
1916           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1917       VecCost +=
1918           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1919       return VecCost - ScalarCost;
1920     }
1921     default:
1922       llvm_unreachable("Unknown instruction");
1923   }
1924 }
1925 
1926 bool BoUpSLP::isFullyVectorizableTinyTree() {
1927   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1928         VectorizableTree.size() << " is fully vectorizable .\n");
1929 
1930   // We only handle trees of heights 1 and 2.
1931   if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
1932     return true;
1933 
1934   if (VectorizableTree.size() != 2)
1935     return false;
1936 
1937   // Handle splat and all-constants stores.
1938   if (!VectorizableTree[0].NeedToGather &&
1939       (allConstant(VectorizableTree[1].Scalars) ||
1940        isSplat(VectorizableTree[1].Scalars)))
1941     return true;
1942 
1943   // Gathering cost would be too much for tiny trees.
1944   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1945     return false;
1946 
1947   return true;
1948 }
1949 
1950 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
1951 
1952   // We can vectorize the tree if its size is greater than or equal to the
1953   // minimum size specified by the MinTreeSize command line option.
1954   if (VectorizableTree.size() >= MinTreeSize)
1955     return false;
1956 
1957   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
1958   // can vectorize it if we can prove it fully vectorizable.
1959   if (isFullyVectorizableTinyTree())
1960     return false;
1961 
1962   assert(VectorizableTree.empty()
1963              ? ExternalUses.empty()
1964              : true && "We shouldn't have any external users");
1965 
1966   // Otherwise, we can't vectorize the tree. It is both tiny and not fully
1967   // vectorizable.
1968   return true;
1969 }
1970 
1971 int BoUpSLP::getSpillCost() {
1972   // Walk from the bottom of the tree to the top, tracking which values are
1973   // live. When we see a call instruction that is not part of our tree,
1974   // query TTI to see if there is a cost to keeping values live over it
1975   // (for example, if spills and fills are required).
1976   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1977   int Cost = 0;
1978 
1979   SmallPtrSet<Instruction*, 4> LiveValues;
1980   Instruction *PrevInst = nullptr;
1981 
1982   for (const auto &N : VectorizableTree) {
1983     Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
1984     if (!Inst)
1985       continue;
1986 
1987     if (!PrevInst) {
1988       PrevInst = Inst;
1989       continue;
1990     }
1991 
1992     // Update LiveValues.
1993     LiveValues.erase(PrevInst);
1994     for (auto &J : PrevInst->operands()) {
1995       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1996         LiveValues.insert(cast<Instruction>(&*J));
1997     }
1998 
1999     DEBUG(
2000       dbgs() << "SLP: #LV: " << LiveValues.size();
2001       for (auto *X : LiveValues)
2002         dbgs() << " " << X->getName();
2003       dbgs() << ", Looking at ";
2004       Inst->dump();
2005       );
2006 
2007     // Now find the sequence of instructions between PrevInst and Inst.
2008     BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
2009                                  PrevInstIt =
2010                                      PrevInst->getIterator().getReverse();
2011     while (InstIt != PrevInstIt) {
2012       if (PrevInstIt == PrevInst->getParent()->rend()) {
2013         PrevInstIt = Inst->getParent()->rbegin();
2014         continue;
2015       }
2016 
2017       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
2018         SmallVector<Type*, 4> V;
2019         for (auto *II : LiveValues)
2020           V.push_back(VectorType::get(II->getType(), BundleWidth));
2021         Cost += TTI->getCostOfKeepingLiveOverCall(V);
2022       }
2023 
2024       ++PrevInstIt;
2025     }
2026 
2027     PrevInst = Inst;
2028   }
2029 
2030   return Cost;
2031 }
2032 
2033 int BoUpSLP::getTreeCost() {
2034   int Cost = 0;
2035   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
2036         VectorizableTree.size() << ".\n");
2037 
2038   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
2039 
2040   for (TreeEntry &TE : VectorizableTree) {
2041     int C = getEntryCost(&TE);
2042     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
2043                  << *TE.Scalars[0] << ".\n");
2044     Cost += C;
2045   }
2046 
2047   SmallSet<Value *, 16> ExtractCostCalculated;
2048   int ExtractCost = 0;
2049   for (ExternalUser &EU : ExternalUses) {
2050     // We only add extract cost once for the same scalar.
2051     if (!ExtractCostCalculated.insert(EU.Scalar).second)
2052       continue;
2053 
2054     // Uses by ephemeral values are free (because the ephemeral value will be
2055     // removed prior to code generation, and so the extraction will be
2056     // removed as well).
2057     if (EphValues.count(EU.User))
2058       continue;
2059 
2060     // If we plan to rewrite the tree in a smaller type, we will need to sign
2061     // extend the extracted value back to the original type. Here, we account
2062     // for the extract and the added cost of the sign extend if needed.
2063     auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
2064     auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2065     if (MinBWs.count(ScalarRoot)) {
2066       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
2067       auto Extend =
2068           MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
2069       VecTy = VectorType::get(MinTy, BundleWidth);
2070       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
2071                                                    VecTy, EU.Lane);
2072     } else {
2073       ExtractCost +=
2074           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
2075     }
2076   }
2077 
2078   int SpillCost = getSpillCost();
2079   Cost += SpillCost + ExtractCost;
2080 
2081   std::string Str;
2082   {
2083     raw_string_ostream OS(Str);
2084     OS << "SLP: Spill Cost = " << SpillCost << ".\n"
2085        << "SLP: Extract Cost = " << ExtractCost << ".\n"
2086        << "SLP: Total Cost = " << Cost << ".\n";
2087   }
2088   DEBUG(dbgs() << Str);
2089 
2090   if (ViewSLPTree)
2091     ViewGraph(this, "SLP" + F->getName(), false, Str);
2092 
2093   return Cost;
2094 }
2095 
2096 int BoUpSLP::getGatherCost(Type *Ty) {
2097   int Cost = 0;
2098   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
2099     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
2100   return Cost;
2101 }
2102 
2103 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
2104   // Find the type of the operands in VL.
2105   Type *ScalarTy = VL[0]->getType();
2106   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2107     ScalarTy = SI->getValueOperand()->getType();
2108   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2109   // Find the cost of inserting/extracting values from the vector.
2110   return getGatherCost(VecTy);
2111 }
2112 
2113 // Reorder commutative operations in alternate shuffle if the resulting vectors
2114 // are consecutive loads. This would allow us to vectorize the tree.
2115 // If we have something like-
2116 // load a[0] - load b[0]
2117 // load b[1] + load a[1]
2118 // load a[2] - load b[2]
2119 // load a[3] + load b[3]
2120 // Reordering the second load b[1]  load a[1] would allow us to vectorize this
2121 // code.
2122 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
2123                                         SmallVectorImpl<Value *> &Left,
2124                                         SmallVectorImpl<Value *> &Right) {
2125   // Push left and right operands of binary operation into Left and Right
2126   for (Value *i : VL) {
2127     Left.push_back(cast<Instruction>(i)->getOperand(0));
2128     Right.push_back(cast<Instruction>(i)->getOperand(1));
2129   }
2130 
2131   // Reorder if we have a commutative operation and consecutive access
2132   // are on either side of the alternate instructions.
2133   for (unsigned j = 0; j < VL.size() - 1; ++j) {
2134     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2135       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2136         Instruction *VL1 = cast<Instruction>(VL[j]);
2137         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2138         if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2139           std::swap(Left[j], Right[j]);
2140           continue;
2141         } else if (VL2->isCommutative() &&
2142                    isConsecutiveAccess(L, L1, *DL, *SE)) {
2143           std::swap(Left[j + 1], Right[j + 1]);
2144           continue;
2145         }
2146         // else unchanged
2147       }
2148     }
2149     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2150       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2151         Instruction *VL1 = cast<Instruction>(VL[j]);
2152         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2153         if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2154           std::swap(Left[j], Right[j]);
2155           continue;
2156         } else if (VL2->isCommutative() &&
2157                    isConsecutiveAccess(L, L1, *DL, *SE)) {
2158           std::swap(Left[j + 1], Right[j + 1]);
2159           continue;
2160         }
2161         // else unchanged
2162       }
2163     }
2164   }
2165 }
2166 
2167 // Return true if I should be commuted before adding it's left and right
2168 // operands to the arrays Left and Right.
2169 //
2170 // The vectorizer is trying to either have all elements one side being
2171 // instruction with the same opcode to enable further vectorization, or having
2172 // a splat to lower the vectorizing cost.
2173 static bool shouldReorderOperands(int i, Instruction &I,
2174                                   SmallVectorImpl<Value *> &Left,
2175                                   SmallVectorImpl<Value *> &Right,
2176                                   bool AllSameOpcodeLeft,
2177                                   bool AllSameOpcodeRight, bool SplatLeft,
2178                                   bool SplatRight) {
2179   Value *VLeft = I.getOperand(0);
2180   Value *VRight = I.getOperand(1);
2181   // If we have "SplatRight", try to see if commuting is needed to preserve it.
2182   if (SplatRight) {
2183     if (VRight == Right[i - 1])
2184       // Preserve SplatRight
2185       return false;
2186     if (VLeft == Right[i - 1]) {
2187       // Commuting would preserve SplatRight, but we don't want to break
2188       // SplatLeft either, i.e. preserve the original order if possible.
2189       // (FIXME: why do we care?)
2190       if (SplatLeft && VLeft == Left[i - 1])
2191         return false;
2192       return true;
2193     }
2194   }
2195   // Symmetrically handle Right side.
2196   if (SplatLeft) {
2197     if (VLeft == Left[i - 1])
2198       // Preserve SplatLeft
2199       return false;
2200     if (VRight == Left[i - 1])
2201       return true;
2202   }
2203 
2204   Instruction *ILeft = dyn_cast<Instruction>(VLeft);
2205   Instruction *IRight = dyn_cast<Instruction>(VRight);
2206 
2207   // If we have "AllSameOpcodeRight", try to see if the left operands preserves
2208   // it and not the right, in this case we want to commute.
2209   if (AllSameOpcodeRight) {
2210     unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2211     if (IRight && RightPrevOpcode == IRight->getOpcode())
2212       // Do not commute, a match on the right preserves AllSameOpcodeRight
2213       return false;
2214     if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
2215       // We have a match and may want to commute, but first check if there is
2216       // not also a match on the existing operands on the Left to preserve
2217       // AllSameOpcodeLeft, i.e. preserve the original order if possible.
2218       // (FIXME: why do we care?)
2219       if (AllSameOpcodeLeft && ILeft &&
2220           cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
2221         return false;
2222       return true;
2223     }
2224   }
2225   // Symmetrically handle Left side.
2226   if (AllSameOpcodeLeft) {
2227     unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2228     if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
2229       return false;
2230     if (IRight && LeftPrevOpcode == IRight->getOpcode())
2231       return true;
2232   }
2233   return false;
2234 }
2235 
2236 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2237                                              SmallVectorImpl<Value *> &Left,
2238                                              SmallVectorImpl<Value *> &Right) {
2239 
2240   if (VL.size()) {
2241     // Peel the first iteration out of the loop since there's nothing
2242     // interesting to do anyway and it simplifies the checks in the loop.
2243     auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
2244     auto VRight = cast<Instruction>(VL[0])->getOperand(1);
2245     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2246       // Favor having instruction to the right. FIXME: why?
2247       std::swap(VLeft, VRight);
2248     Left.push_back(VLeft);
2249     Right.push_back(VRight);
2250   }
2251 
2252   // Keep track if we have instructions with all the same opcode on one side.
2253   bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2254   bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2255   // Keep track if we have one side with all the same value (broadcast).
2256   bool SplatLeft = true;
2257   bool SplatRight = true;
2258 
2259   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
2260     Instruction *I = cast<Instruction>(VL[i]);
2261     assert(I->isCommutative() && "Can only process commutative instruction");
2262     // Commute to favor either a splat or maximizing having the same opcodes on
2263     // one side.
2264     if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
2265                               AllSameOpcodeRight, SplatLeft, SplatRight)) {
2266       Left.push_back(I->getOperand(1));
2267       Right.push_back(I->getOperand(0));
2268     } else {
2269       Left.push_back(I->getOperand(0));
2270       Right.push_back(I->getOperand(1));
2271     }
2272     // Update Splat* and AllSameOpcode* after the insertion.
2273     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
2274     SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
2275     AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
2276                         (cast<Instruction>(Left[i - 1])->getOpcode() ==
2277                          cast<Instruction>(Left[i])->getOpcode());
2278     AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
2279                          (cast<Instruction>(Right[i - 1])->getOpcode() ==
2280                           cast<Instruction>(Right[i])->getOpcode());
2281   }
2282 
2283   // If one operand end up being broadcast, return this operand order.
2284   if (SplatRight || SplatLeft)
2285     return;
2286 
2287   // Finally check if we can get longer vectorizable chain by reordering
2288   // without breaking the good operand order detected above.
2289   // E.g. If we have something like-
2290   // load a[0]  load b[0]
2291   // load b[1]  load a[1]
2292   // load a[2]  load b[2]
2293   // load a[3]  load b[3]
2294   // Reordering the second load b[1]  load a[1] would allow us to vectorize
2295   // this code and we still retain AllSameOpcode property.
2296   // FIXME: This load reordering might break AllSameOpcode in some rare cases
2297   // such as-
2298   // add a[0],c[0]  load b[0]
2299   // add a[1],c[2]  load b[1]
2300   // b[2]           load b[2]
2301   // add a[3],c[3]  load b[3]
2302   for (unsigned j = 0; j < VL.size() - 1; ++j) {
2303     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2304       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2305         if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2306           std::swap(Left[j + 1], Right[j + 1]);
2307           continue;
2308         }
2309       }
2310     }
2311     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2312       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2313         if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2314           std::swap(Left[j + 1], Right[j + 1]);
2315           continue;
2316         }
2317       }
2318     }
2319     // else unchanged
2320   }
2321 }
2322 
2323 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2324 
2325   // Get the basic block this bundle is in. All instructions in the bundle
2326   // should be in this block.
2327   auto *Front = cast<Instruction>(VL.front());
2328   auto *BB = Front->getParent();
2329   assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
2330     return cast<Instruction>(V)->getParent() == BB;
2331   }));
2332 
2333   // The last instruction in the bundle in program order.
2334   Instruction *LastInst = nullptr;
2335 
2336   // Find the last instruction. The common case should be that BB has been
2337   // scheduled, and the last instruction is VL.back(). So we start with
2338   // VL.back() and iterate over schedule data until we reach the end of the
2339   // bundle. The end of the bundle is marked by null ScheduleData.
2340   if (BlocksSchedules.count(BB)) {
2341     auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
2342     if (Bundle && Bundle->isPartOfBundle())
2343       for (; Bundle; Bundle = Bundle->NextInBundle)
2344         LastInst = Bundle->Inst;
2345   }
2346 
2347   // LastInst can still be null at this point if there's either not an entry
2348   // for BB in BlocksSchedules or there's no ScheduleData available for
2349   // VL.back(). This can be the case if buildTree_rec aborts for various
2350   // reasons (e.g., the maximum recursion depth is reached, the maximum region
2351   // size is reached, etc.). ScheduleData is initialized in the scheduling
2352   // "dry-run".
2353   //
2354   // If this happens, we can still find the last instruction by brute force. We
2355   // iterate forwards from Front (inclusive) until we either see all
2356   // instructions in the bundle or reach the end of the block. If Front is the
2357   // last instruction in program order, LastInst will be set to Front, and we
2358   // will visit all the remaining instructions in the block.
2359   //
2360   // One of the reasons we exit early from buildTree_rec is to place an upper
2361   // bound on compile-time. Thus, taking an additional compile-time hit here is
2362   // not ideal. However, this should be exceedingly rare since it requires that
2363   // we both exit early from buildTree_rec and that the bundle be out-of-order
2364   // (causing us to iterate all the way to the end of the block).
2365   if (!LastInst) {
2366     SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
2367     for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
2368       if (Bundle.erase(&I))
2369         LastInst = &I;
2370       if (Bundle.empty())
2371         break;
2372     }
2373   }
2374 
2375   // Set the insertion point after the last instruction in the bundle. Set the
2376   // debug location to Front.
2377   Builder.SetInsertPoint(BB, ++LastInst->getIterator());
2378   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
2379 }
2380 
2381 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2382   Value *Vec = UndefValue::get(Ty);
2383   // Generate the 'InsertElement' instruction.
2384   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2385     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2386     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2387       GatherSeq.insert(Insrt);
2388       CSEBlocks.insert(Insrt->getParent());
2389 
2390       // Add to our 'need-to-extract' list.
2391       if (ScalarToTreeEntry.count(VL[i])) {
2392         int Idx = ScalarToTreeEntry[VL[i]];
2393         TreeEntry *E = &VectorizableTree[Idx];
2394         // Find which lane we need to extract.
2395         int FoundLane = -1;
2396         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2397           // Is this the lane of the scalar that we are looking for ?
2398           if (E->Scalars[Lane] == VL[i]) {
2399             FoundLane = Lane;
2400             break;
2401           }
2402         }
2403         assert(FoundLane >= 0 && "Could not find the correct lane");
2404         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2405       }
2406     }
2407   }
2408 
2409   return Vec;
2410 }
2411 
2412 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2413   SmallDenseMap<Value*, int>::const_iterator Entry
2414     = ScalarToTreeEntry.find(VL[0]);
2415   if (Entry != ScalarToTreeEntry.end()) {
2416     int Idx = Entry->second;
2417     const TreeEntry *En = &VectorizableTree[Idx];
2418     if (En->isSame(VL) && En->VectorizedValue)
2419       return En->VectorizedValue;
2420   }
2421   return nullptr;
2422 }
2423 
2424 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2425   if (ScalarToTreeEntry.count(VL[0])) {
2426     int Idx = ScalarToTreeEntry[VL[0]];
2427     TreeEntry *E = &VectorizableTree[Idx];
2428     if (E->isSame(VL))
2429       return vectorizeTree(E);
2430   }
2431 
2432   Type *ScalarTy = VL[0]->getType();
2433   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2434     ScalarTy = SI->getValueOperand()->getType();
2435   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2436 
2437   return Gather(VL, VecTy);
2438 }
2439 
2440 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2441   IRBuilder<>::InsertPointGuard Guard(Builder);
2442 
2443   if (E->VectorizedValue) {
2444     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2445     return E->VectorizedValue;
2446   }
2447 
2448   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2449   Type *ScalarTy = VL0->getType();
2450   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2451     ScalarTy = SI->getValueOperand()->getType();
2452   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2453 
2454   if (E->NeedToGather) {
2455     setInsertPointAfterBundle(E->Scalars);
2456     auto *V = Gather(E->Scalars, VecTy);
2457     E->VectorizedValue = V;
2458     return V;
2459   }
2460 
2461   unsigned Opcode = getSameOpcode(E->Scalars);
2462 
2463   switch (Opcode) {
2464     case Instruction::PHI: {
2465       PHINode *PH = dyn_cast<PHINode>(VL0);
2466       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2467       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2468       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2469       E->VectorizedValue = NewPhi;
2470 
2471       // PHINodes may have multiple entries from the same block. We want to
2472       // visit every block once.
2473       SmallSet<BasicBlock*, 4> VisitedBBs;
2474 
2475       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2476         ValueList Operands;
2477         BasicBlock *IBB = PH->getIncomingBlock(i);
2478 
2479         if (!VisitedBBs.insert(IBB).second) {
2480           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2481           continue;
2482         }
2483 
2484         // Prepare the operand vector.
2485         for (Value *V : E->Scalars)
2486           Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2487 
2488         Builder.SetInsertPoint(IBB->getTerminator());
2489         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2490         Value *Vec = vectorizeTree(Operands);
2491         NewPhi->addIncoming(Vec, IBB);
2492       }
2493 
2494       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2495              "Invalid number of incoming values");
2496       return NewPhi;
2497     }
2498 
2499     case Instruction::ExtractElement: {
2500       if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
2501         Value *V = VL0->getOperand(0);
2502         E->VectorizedValue = V;
2503         return V;
2504       }
2505       setInsertPointAfterBundle(E->Scalars);
2506       auto *V = Gather(E->Scalars, VecTy);
2507       E->VectorizedValue = V;
2508       return V;
2509     }
2510     case Instruction::ExtractValue: {
2511       if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
2512         LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
2513         Builder.SetInsertPoint(LI);
2514         PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
2515         Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
2516         LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
2517         E->VectorizedValue = V;
2518         return propagateMetadata(V, E->Scalars);
2519       }
2520       setInsertPointAfterBundle(E->Scalars);
2521       auto *V = Gather(E->Scalars, VecTy);
2522       E->VectorizedValue = V;
2523       return V;
2524     }
2525     case Instruction::ZExt:
2526     case Instruction::SExt:
2527     case Instruction::FPToUI:
2528     case Instruction::FPToSI:
2529     case Instruction::FPExt:
2530     case Instruction::PtrToInt:
2531     case Instruction::IntToPtr:
2532     case Instruction::SIToFP:
2533     case Instruction::UIToFP:
2534     case Instruction::Trunc:
2535     case Instruction::FPTrunc:
2536     case Instruction::BitCast: {
2537       ValueList INVL;
2538       for (Value *V : E->Scalars)
2539         INVL.push_back(cast<Instruction>(V)->getOperand(0));
2540 
2541       setInsertPointAfterBundle(E->Scalars);
2542 
2543       Value *InVec = vectorizeTree(INVL);
2544 
2545       if (Value *V = alreadyVectorized(E->Scalars))
2546         return V;
2547 
2548       CastInst *CI = dyn_cast<CastInst>(VL0);
2549       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2550       E->VectorizedValue = V;
2551       ++NumVectorInstructions;
2552       return V;
2553     }
2554     case Instruction::FCmp:
2555     case Instruction::ICmp: {
2556       ValueList LHSV, RHSV;
2557       for (Value *V : E->Scalars) {
2558         LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2559         RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2560       }
2561 
2562       setInsertPointAfterBundle(E->Scalars);
2563 
2564       Value *L = vectorizeTree(LHSV);
2565       Value *R = vectorizeTree(RHSV);
2566 
2567       if (Value *V = alreadyVectorized(E->Scalars))
2568         return V;
2569 
2570       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2571       Value *V;
2572       if (Opcode == Instruction::FCmp)
2573         V = Builder.CreateFCmp(P0, L, R);
2574       else
2575         V = Builder.CreateICmp(P0, L, R);
2576 
2577       E->VectorizedValue = V;
2578       propagateIRFlags(E->VectorizedValue, E->Scalars);
2579       ++NumVectorInstructions;
2580       return V;
2581     }
2582     case Instruction::Select: {
2583       ValueList TrueVec, FalseVec, CondVec;
2584       for (Value *V : E->Scalars) {
2585         CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2586         TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2587         FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2588       }
2589 
2590       setInsertPointAfterBundle(E->Scalars);
2591 
2592       Value *Cond = vectorizeTree(CondVec);
2593       Value *True = vectorizeTree(TrueVec);
2594       Value *False = vectorizeTree(FalseVec);
2595 
2596       if (Value *V = alreadyVectorized(E->Scalars))
2597         return V;
2598 
2599       Value *V = Builder.CreateSelect(Cond, True, False);
2600       E->VectorizedValue = V;
2601       ++NumVectorInstructions;
2602       return V;
2603     }
2604     case Instruction::Add:
2605     case Instruction::FAdd:
2606     case Instruction::Sub:
2607     case Instruction::FSub:
2608     case Instruction::Mul:
2609     case Instruction::FMul:
2610     case Instruction::UDiv:
2611     case Instruction::SDiv:
2612     case Instruction::FDiv:
2613     case Instruction::URem:
2614     case Instruction::SRem:
2615     case Instruction::FRem:
2616     case Instruction::Shl:
2617     case Instruction::LShr:
2618     case Instruction::AShr:
2619     case Instruction::And:
2620     case Instruction::Or:
2621     case Instruction::Xor: {
2622       ValueList LHSVL, RHSVL;
2623       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2624         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2625       else
2626         for (Value *V : E->Scalars) {
2627           LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2628           RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2629         }
2630 
2631       setInsertPointAfterBundle(E->Scalars);
2632 
2633       Value *LHS = vectorizeTree(LHSVL);
2634       Value *RHS = vectorizeTree(RHSVL);
2635 
2636       if (Value *V = alreadyVectorized(E->Scalars))
2637         return V;
2638 
2639       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2640       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2641       E->VectorizedValue = V;
2642       propagateIRFlags(E->VectorizedValue, E->Scalars);
2643       ++NumVectorInstructions;
2644 
2645       if (Instruction *I = dyn_cast<Instruction>(V))
2646         return propagateMetadata(I, E->Scalars);
2647 
2648       return V;
2649     }
2650     case Instruction::Load: {
2651       // Loads are inserted at the head of the tree because we don't want to
2652       // sink them all the way down past store instructions.
2653       setInsertPointAfterBundle(E->Scalars);
2654 
2655       LoadInst *LI = cast<LoadInst>(VL0);
2656       Type *ScalarLoadTy = LI->getType();
2657       unsigned AS = LI->getPointerAddressSpace();
2658 
2659       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2660                                             VecTy->getPointerTo(AS));
2661 
2662       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2663       // ExternalUses list to make sure that an extract will be generated in the
2664       // future.
2665       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2666         ExternalUses.push_back(
2667             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2668 
2669       unsigned Alignment = LI->getAlignment();
2670       LI = Builder.CreateLoad(VecPtr);
2671       if (!Alignment) {
2672         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2673       }
2674       LI->setAlignment(Alignment);
2675       E->VectorizedValue = LI;
2676       ++NumVectorInstructions;
2677       return propagateMetadata(LI, E->Scalars);
2678     }
2679     case Instruction::Store: {
2680       StoreInst *SI = cast<StoreInst>(VL0);
2681       unsigned Alignment = SI->getAlignment();
2682       unsigned AS = SI->getPointerAddressSpace();
2683 
2684       ValueList ValueOp;
2685       for (Value *V : E->Scalars)
2686         ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2687 
2688       setInsertPointAfterBundle(E->Scalars);
2689 
2690       Value *VecValue = vectorizeTree(ValueOp);
2691       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2692                                             VecTy->getPointerTo(AS));
2693       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2694 
2695       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2696       // ExternalUses list to make sure that an extract will be generated in the
2697       // future.
2698       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2699         ExternalUses.push_back(
2700             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2701 
2702       if (!Alignment) {
2703         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2704       }
2705       S->setAlignment(Alignment);
2706       E->VectorizedValue = S;
2707       ++NumVectorInstructions;
2708       return propagateMetadata(S, E->Scalars);
2709     }
2710     case Instruction::GetElementPtr: {
2711       setInsertPointAfterBundle(E->Scalars);
2712 
2713       ValueList Op0VL;
2714       for (Value *V : E->Scalars)
2715         Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2716 
2717       Value *Op0 = vectorizeTree(Op0VL);
2718 
2719       std::vector<Value *> OpVecs;
2720       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2721            ++j) {
2722         ValueList OpVL;
2723         for (Value *V : E->Scalars)
2724           OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2725 
2726         Value *OpVec = vectorizeTree(OpVL);
2727         OpVecs.push_back(OpVec);
2728       }
2729 
2730       Value *V = Builder.CreateGEP(
2731           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2732       E->VectorizedValue = V;
2733       ++NumVectorInstructions;
2734 
2735       if (Instruction *I = dyn_cast<Instruction>(V))
2736         return propagateMetadata(I, E->Scalars);
2737 
2738       return V;
2739     }
2740     case Instruction::Call: {
2741       CallInst *CI = cast<CallInst>(VL0);
2742       setInsertPointAfterBundle(E->Scalars);
2743       Function *FI;
2744       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2745       Value *ScalarArg = nullptr;
2746       if (CI && (FI = CI->getCalledFunction())) {
2747         IID = FI->getIntrinsicID();
2748       }
2749       std::vector<Value *> OpVecs;
2750       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2751         ValueList OpVL;
2752         // ctlz,cttz and powi are special intrinsics whose second argument is
2753         // a scalar. This argument should not be vectorized.
2754         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2755           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2756           ScalarArg = CEI->getArgOperand(j);
2757           OpVecs.push_back(CEI->getArgOperand(j));
2758           continue;
2759         }
2760         for (Value *V : E->Scalars) {
2761           CallInst *CEI = cast<CallInst>(V);
2762           OpVL.push_back(CEI->getArgOperand(j));
2763         }
2764 
2765         Value *OpVec = vectorizeTree(OpVL);
2766         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2767         OpVecs.push_back(OpVec);
2768       }
2769 
2770       Module *M = F->getParent();
2771       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2772       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2773       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2774       SmallVector<OperandBundleDef, 1> OpBundles;
2775       CI->getOperandBundlesAsDefs(OpBundles);
2776       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
2777 
2778       // The scalar argument uses an in-tree scalar so we add the new vectorized
2779       // call to ExternalUses list to make sure that an extract will be
2780       // generated in the future.
2781       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2782         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2783 
2784       E->VectorizedValue = V;
2785       propagateIRFlags(E->VectorizedValue, E->Scalars);
2786       ++NumVectorInstructions;
2787       return V;
2788     }
2789     case Instruction::ShuffleVector: {
2790       ValueList LHSVL, RHSVL;
2791       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2792       reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2793       setInsertPointAfterBundle(E->Scalars);
2794 
2795       Value *LHS = vectorizeTree(LHSVL);
2796       Value *RHS = vectorizeTree(RHSVL);
2797 
2798       if (Value *V = alreadyVectorized(E->Scalars))
2799         return V;
2800 
2801       // Create a vector of LHS op1 RHS
2802       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2803       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2804 
2805       // Create a vector of LHS op2 RHS
2806       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2807       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2808       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2809 
2810       // Create shuffle to take alternate operations from the vector.
2811       // Also, gather up odd and even scalar ops to propagate IR flags to
2812       // each vector operation.
2813       ValueList OddScalars, EvenScalars;
2814       unsigned e = E->Scalars.size();
2815       SmallVector<Constant *, 8> Mask(e);
2816       for (unsigned i = 0; i < e; ++i) {
2817         if (i & 1) {
2818           Mask[i] = Builder.getInt32(e + i);
2819           OddScalars.push_back(E->Scalars[i]);
2820         } else {
2821           Mask[i] = Builder.getInt32(i);
2822           EvenScalars.push_back(E->Scalars[i]);
2823         }
2824       }
2825 
2826       Value *ShuffleMask = ConstantVector::get(Mask);
2827       propagateIRFlags(V0, EvenScalars);
2828       propagateIRFlags(V1, OddScalars);
2829 
2830       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2831       E->VectorizedValue = V;
2832       ++NumVectorInstructions;
2833       if (Instruction *I = dyn_cast<Instruction>(V))
2834         return propagateMetadata(I, E->Scalars);
2835 
2836       return V;
2837     }
2838     default:
2839     llvm_unreachable("unknown inst");
2840   }
2841   return nullptr;
2842 }
2843 
2844 Value *BoUpSLP::vectorizeTree() {
2845   ExtraValueToDebugLocsMap ExternallyUsedValues;
2846   return vectorizeTree(ExternallyUsedValues);
2847 }
2848 
2849 Value *
2850 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
2851 
2852   // All blocks must be scheduled before any instructions are inserted.
2853   for (auto &BSIter : BlocksSchedules) {
2854     scheduleBlock(BSIter.second.get());
2855   }
2856 
2857   Builder.SetInsertPoint(&F->getEntryBlock().front());
2858   auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
2859 
2860   // If the vectorized tree can be rewritten in a smaller type, we truncate the
2861   // vectorized root. InstCombine will then rewrite the entire expression. We
2862   // sign extend the extracted values below.
2863   auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2864   if (MinBWs.count(ScalarRoot)) {
2865     if (auto *I = dyn_cast<Instruction>(VectorRoot))
2866       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
2867     auto BundleWidth = VectorizableTree[0].Scalars.size();
2868     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
2869     auto *VecTy = VectorType::get(MinTy, BundleWidth);
2870     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
2871     VectorizableTree[0].VectorizedValue = Trunc;
2872   }
2873 
2874   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2875 
2876   // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
2877   // specified by ScalarType.
2878   auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
2879     if (!MinBWs.count(ScalarRoot))
2880       return Ex;
2881     if (MinBWs[ScalarRoot].second)
2882       return Builder.CreateSExt(Ex, ScalarType);
2883     return Builder.CreateZExt(Ex, ScalarType);
2884   };
2885 
2886   // Extract all of the elements with the external uses.
2887   for (const auto &ExternalUse : ExternalUses) {
2888     Value *Scalar = ExternalUse.Scalar;
2889     llvm::User *User = ExternalUse.User;
2890 
2891     // Skip users that we already RAUW. This happens when one instruction
2892     // has multiple uses of the same value.
2893     if (User && !is_contained(Scalar->users(), User))
2894       continue;
2895     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2896 
2897     int Idx = ScalarToTreeEntry[Scalar];
2898     TreeEntry *E = &VectorizableTree[Idx];
2899     assert(!E->NeedToGather && "Extracting from a gather list");
2900 
2901     Value *Vec = E->VectorizedValue;
2902     assert(Vec && "Can't find vectorizable value");
2903 
2904     Value *Lane = Builder.getInt32(ExternalUse.Lane);
2905     // If User == nullptr, the Scalar is used as extra arg. Generate
2906     // ExtractElement instruction and update the record for this scalar in
2907     // ExternallyUsedValues.
2908     if (!User) {
2909       assert(ExternallyUsedValues.count(Scalar) &&
2910              "Scalar with nullptr as an external user must be registered in "
2911              "ExternallyUsedValues map");
2912       if (auto *VecI = dyn_cast<Instruction>(Vec)) {
2913         Builder.SetInsertPoint(VecI->getParent(),
2914                                std::next(VecI->getIterator()));
2915       } else {
2916         Builder.SetInsertPoint(&F->getEntryBlock().front());
2917       }
2918       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2919       Ex = extend(ScalarRoot, Ex, Scalar->getType());
2920       CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
2921       auto &Locs = ExternallyUsedValues[Scalar];
2922       ExternallyUsedValues.insert({Ex, Locs});
2923       ExternallyUsedValues.erase(Scalar);
2924       continue;
2925     }
2926 
2927     // Generate extracts for out-of-tree users.
2928     // Find the insertion point for the extractelement lane.
2929     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
2930       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2931         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2932           if (PH->getIncomingValue(i) == Scalar) {
2933             TerminatorInst *IncomingTerminator =
2934                 PH->getIncomingBlock(i)->getTerminator();
2935             if (isa<CatchSwitchInst>(IncomingTerminator)) {
2936               Builder.SetInsertPoint(VecI->getParent(),
2937                                      std::next(VecI->getIterator()));
2938             } else {
2939               Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2940             }
2941             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2942             Ex = extend(ScalarRoot, Ex, Scalar->getType());
2943             CSEBlocks.insert(PH->getIncomingBlock(i));
2944             PH->setOperand(i, Ex);
2945           }
2946         }
2947       } else {
2948         Builder.SetInsertPoint(cast<Instruction>(User));
2949         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2950         Ex = extend(ScalarRoot, Ex, Scalar->getType());
2951         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2952         User->replaceUsesOfWith(Scalar, Ex);
2953      }
2954     } else {
2955       Builder.SetInsertPoint(&F->getEntryBlock().front());
2956       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2957       Ex = extend(ScalarRoot, Ex, Scalar->getType());
2958       CSEBlocks.insert(&F->getEntryBlock());
2959       User->replaceUsesOfWith(Scalar, Ex);
2960     }
2961 
2962     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2963   }
2964 
2965   // For each vectorized value:
2966   for (TreeEntry &EIdx : VectorizableTree) {
2967     TreeEntry *Entry = &EIdx;
2968 
2969     // For each lane:
2970     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2971       Value *Scalar = Entry->Scalars[Lane];
2972       // No need to handle users of gathered values.
2973       if (Entry->NeedToGather)
2974         continue;
2975 
2976       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2977 
2978       Type *Ty = Scalar->getType();
2979       if (!Ty->isVoidTy()) {
2980 #ifndef NDEBUG
2981         for (User *U : Scalar->users()) {
2982           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2983 
2984           assert((ScalarToTreeEntry.count(U) ||
2985                   // It is legal to replace users in the ignorelist by undef.
2986                   is_contained(UserIgnoreList, U)) &&
2987                  "Replacing out-of-tree value with undef");
2988         }
2989 #endif
2990         Value *Undef = UndefValue::get(Ty);
2991         Scalar->replaceAllUsesWith(Undef);
2992       }
2993       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2994       eraseInstruction(cast<Instruction>(Scalar));
2995     }
2996   }
2997 
2998   Builder.ClearInsertionPoint();
2999 
3000   return VectorizableTree[0].VectorizedValue;
3001 }
3002 
3003 void BoUpSLP::optimizeGatherSequence() {
3004   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
3005         << " gather sequences instructions.\n");
3006   // LICM InsertElementInst sequences.
3007   for (Instruction *it : GatherSeq) {
3008     InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
3009 
3010     if (!Insert)
3011       continue;
3012 
3013     // Check if this block is inside a loop.
3014     Loop *L = LI->getLoopFor(Insert->getParent());
3015     if (!L)
3016       continue;
3017 
3018     // Check if it has a preheader.
3019     BasicBlock *PreHeader = L->getLoopPreheader();
3020     if (!PreHeader)
3021       continue;
3022 
3023     // If the vector or the element that we insert into it are
3024     // instructions that are defined in this basic block then we can't
3025     // hoist this instruction.
3026     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
3027     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
3028     if (CurrVec && L->contains(CurrVec))
3029       continue;
3030     if (NewElem && L->contains(NewElem))
3031       continue;
3032 
3033     // We can hoist this instruction. Move it to the pre-header.
3034     Insert->moveBefore(PreHeader->getTerminator());
3035   }
3036 
3037   // Make a list of all reachable blocks in our CSE queue.
3038   SmallVector<const DomTreeNode *, 8> CSEWorkList;
3039   CSEWorkList.reserve(CSEBlocks.size());
3040   for (BasicBlock *BB : CSEBlocks)
3041     if (DomTreeNode *N = DT->getNode(BB)) {
3042       assert(DT->isReachableFromEntry(N));
3043       CSEWorkList.push_back(N);
3044     }
3045 
3046   // Sort blocks by domination. This ensures we visit a block after all blocks
3047   // dominating it are visited.
3048   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
3049                    [this](const DomTreeNode *A, const DomTreeNode *B) {
3050     return DT->properlyDominates(A, B);
3051   });
3052 
3053   // Perform O(N^2) search over the gather sequences and merge identical
3054   // instructions. TODO: We can further optimize this scan if we split the
3055   // instructions into different buckets based on the insert lane.
3056   SmallVector<Instruction *, 16> Visited;
3057   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
3058     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
3059            "Worklist not sorted properly!");
3060     BasicBlock *BB = (*I)->getBlock();
3061     // For all instructions in blocks containing gather sequences:
3062     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
3063       Instruction *In = &*it++;
3064       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
3065         continue;
3066 
3067       // Check if we can replace this instruction with any of the
3068       // visited instructions.
3069       for (Instruction *v : Visited) {
3070         if (In->isIdenticalTo(v) &&
3071             DT->dominates(v->getParent(), In->getParent())) {
3072           In->replaceAllUsesWith(v);
3073           eraseInstruction(In);
3074           In = nullptr;
3075           break;
3076         }
3077       }
3078       if (In) {
3079         assert(!is_contained(Visited, In));
3080         Visited.push_back(In);
3081       }
3082     }
3083   }
3084   CSEBlocks.clear();
3085   GatherSeq.clear();
3086 }
3087 
3088 // Groups the instructions to a bundle (which is then a single scheduling entity)
3089 // and schedules instructions until the bundle gets ready.
3090 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
3091                                                  BoUpSLP *SLP) {
3092   if (isa<PHINode>(VL[0]))
3093     return true;
3094 
3095   // Initialize the instruction bundle.
3096   Instruction *OldScheduleEnd = ScheduleEnd;
3097   ScheduleData *PrevInBundle = nullptr;
3098   ScheduleData *Bundle = nullptr;
3099   bool ReSchedule = false;
3100   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
3101 
3102   // Make sure that the scheduling region contains all
3103   // instructions of the bundle.
3104   for (Value *V : VL) {
3105     if (!extendSchedulingRegion(V))
3106       return false;
3107   }
3108 
3109   for (Value *V : VL) {
3110     ScheduleData *BundleMember = getScheduleData(V);
3111     assert(BundleMember &&
3112            "no ScheduleData for bundle member (maybe not in same basic block)");
3113     if (BundleMember->IsScheduled) {
3114       // A bundle member was scheduled as single instruction before and now
3115       // needs to be scheduled as part of the bundle. We just get rid of the
3116       // existing schedule.
3117       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
3118                    << " was already scheduled\n");
3119       ReSchedule = true;
3120     }
3121     assert(BundleMember->isSchedulingEntity() &&
3122            "bundle member already part of other bundle");
3123     if (PrevInBundle) {
3124       PrevInBundle->NextInBundle = BundleMember;
3125     } else {
3126       Bundle = BundleMember;
3127     }
3128     BundleMember->UnscheduledDepsInBundle = 0;
3129     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
3130 
3131     // Group the instructions to a bundle.
3132     BundleMember->FirstInBundle = Bundle;
3133     PrevInBundle = BundleMember;
3134   }
3135   if (ScheduleEnd != OldScheduleEnd) {
3136     // The scheduling region got new instructions at the lower end (or it is a
3137     // new region for the first bundle). This makes it necessary to
3138     // recalculate all dependencies.
3139     // It is seldom that this needs to be done a second time after adding the
3140     // initial bundle to the region.
3141     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3142       ScheduleData *SD = getScheduleData(I);
3143       SD->clearDependencies();
3144     }
3145     ReSchedule = true;
3146   }
3147   if (ReSchedule) {
3148     resetSchedule();
3149     initialFillReadyList(ReadyInsts);
3150   }
3151 
3152   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
3153                << BB->getName() << "\n");
3154 
3155   calculateDependencies(Bundle, true, SLP);
3156 
3157   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
3158   // means that there are no cyclic dependencies and we can schedule it.
3159   // Note that's important that we don't "schedule" the bundle yet (see
3160   // cancelScheduling).
3161   while (!Bundle->isReady() && !ReadyInsts.empty()) {
3162 
3163     ScheduleData *pickedSD = ReadyInsts.back();
3164     ReadyInsts.pop_back();
3165 
3166     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
3167       schedule(pickedSD, ReadyInsts);
3168     }
3169   }
3170   if (!Bundle->isReady()) {
3171     cancelScheduling(VL);
3172     return false;
3173   }
3174   return true;
3175 }
3176 
3177 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
3178   if (isa<PHINode>(VL[0]))
3179     return;
3180 
3181   ScheduleData *Bundle = getScheduleData(VL[0]);
3182   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
3183   assert(!Bundle->IsScheduled &&
3184          "Can't cancel bundle which is already scheduled");
3185   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
3186          "tried to unbundle something which is not a bundle");
3187 
3188   // Un-bundle: make single instructions out of the bundle.
3189   ScheduleData *BundleMember = Bundle;
3190   while (BundleMember) {
3191     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
3192     BundleMember->FirstInBundle = BundleMember;
3193     ScheduleData *Next = BundleMember->NextInBundle;
3194     BundleMember->NextInBundle = nullptr;
3195     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
3196     if (BundleMember->UnscheduledDepsInBundle == 0) {
3197       ReadyInsts.insert(BundleMember);
3198     }
3199     BundleMember = Next;
3200   }
3201 }
3202 
3203 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
3204   if (getScheduleData(V))
3205     return true;
3206   Instruction *I = dyn_cast<Instruction>(V);
3207   assert(I && "bundle member must be an instruction");
3208   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3209   if (!ScheduleStart) {
3210     // It's the first instruction in the new region.
3211     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
3212     ScheduleStart = I;
3213     ScheduleEnd = I->getNextNode();
3214     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3215     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
3216     return true;
3217   }
3218   // Search up and down at the same time, because we don't know if the new
3219   // instruction is above or below the existing scheduling region.
3220   BasicBlock::reverse_iterator UpIter =
3221       ++ScheduleStart->getIterator().getReverse();
3222   BasicBlock::reverse_iterator UpperEnd = BB->rend();
3223   BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
3224   BasicBlock::iterator LowerEnd = BB->end();
3225   for (;;) {
3226     if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
3227       DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
3228       return false;
3229     }
3230 
3231     if (UpIter != UpperEnd) {
3232       if (&*UpIter == I) {
3233         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
3234         ScheduleStart = I;
3235         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
3236         return true;
3237       }
3238       UpIter++;
3239     }
3240     if (DownIter != LowerEnd) {
3241       if (&*DownIter == I) {
3242         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
3243                          nullptr);
3244         ScheduleEnd = I->getNextNode();
3245         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3246         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
3247         return true;
3248       }
3249       DownIter++;
3250     }
3251     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
3252            "instruction not found in block");
3253   }
3254   return true;
3255 }
3256 
3257 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
3258                                                 Instruction *ToI,
3259                                                 ScheduleData *PrevLoadStore,
3260                                                 ScheduleData *NextLoadStore) {
3261   ScheduleData *CurrentLoadStore = PrevLoadStore;
3262   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
3263     ScheduleData *SD = ScheduleDataMap[I];
3264     if (!SD) {
3265       // Allocate a new ScheduleData for the instruction.
3266       if (ChunkPos >= ChunkSize) {
3267         ScheduleDataChunks.push_back(
3268             llvm::make_unique<ScheduleData[]>(ChunkSize));
3269         ChunkPos = 0;
3270       }
3271       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
3272       ScheduleDataMap[I] = SD;
3273       SD->Inst = I;
3274     }
3275     assert(!isInSchedulingRegion(SD) &&
3276            "new ScheduleData already in scheduling region");
3277     SD->init(SchedulingRegionID);
3278 
3279     if (I->mayReadOrWriteMemory()) {
3280       // Update the linked list of memory accessing instructions.
3281       if (CurrentLoadStore) {
3282         CurrentLoadStore->NextLoadStore = SD;
3283       } else {
3284         FirstLoadStoreInRegion = SD;
3285       }
3286       CurrentLoadStore = SD;
3287     }
3288   }
3289   if (NextLoadStore) {
3290     if (CurrentLoadStore)
3291       CurrentLoadStore->NextLoadStore = NextLoadStore;
3292   } else {
3293     LastLoadStoreInRegion = CurrentLoadStore;
3294   }
3295 }
3296 
3297 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
3298                                                      bool InsertInReadyList,
3299                                                      BoUpSLP *SLP) {
3300   assert(SD->isSchedulingEntity());
3301 
3302   SmallVector<ScheduleData *, 10> WorkList;
3303   WorkList.push_back(SD);
3304 
3305   while (!WorkList.empty()) {
3306     ScheduleData *SD = WorkList.back();
3307     WorkList.pop_back();
3308 
3309     ScheduleData *BundleMember = SD;
3310     while (BundleMember) {
3311       assert(isInSchedulingRegion(BundleMember));
3312       if (!BundleMember->hasValidDependencies()) {
3313 
3314         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
3315         BundleMember->Dependencies = 0;
3316         BundleMember->resetUnscheduledDeps();
3317 
3318         // Handle def-use chain dependencies.
3319         for (User *U : BundleMember->Inst->users()) {
3320           if (isa<Instruction>(U)) {
3321             ScheduleData *UseSD = getScheduleData(U);
3322             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3323               BundleMember->Dependencies++;
3324               ScheduleData *DestBundle = UseSD->FirstInBundle;
3325               if (!DestBundle->IsScheduled) {
3326                 BundleMember->incrementUnscheduledDeps(1);
3327               }
3328               if (!DestBundle->hasValidDependencies()) {
3329                 WorkList.push_back(DestBundle);
3330               }
3331             }
3332           } else {
3333             // I'm not sure if this can ever happen. But we need to be safe.
3334             // This lets the instruction/bundle never be scheduled and
3335             // eventually disable vectorization.
3336             BundleMember->Dependencies++;
3337             BundleMember->incrementUnscheduledDeps(1);
3338           }
3339         }
3340 
3341         // Handle the memory dependencies.
3342         ScheduleData *DepDest = BundleMember->NextLoadStore;
3343         if (DepDest) {
3344           Instruction *SrcInst = BundleMember->Inst;
3345           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
3346           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
3347           unsigned numAliased = 0;
3348           unsigned DistToSrc = 1;
3349 
3350           while (DepDest) {
3351             assert(isInSchedulingRegion(DepDest));
3352 
3353             // We have two limits to reduce the complexity:
3354             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
3355             //    SLP->isAliased (which is the expensive part in this loop).
3356             // 2) MaxMemDepDistance: It's for very large blocks and it aborts
3357             //    the whole loop (even if the loop is fast, it's quadratic).
3358             //    It's important for the loop break condition (see below) to
3359             //    check this limit even between two read-only instructions.
3360             if (DistToSrc >= MaxMemDepDistance ||
3361                     ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
3362                      (numAliased >= AliasedCheckLimit ||
3363                       SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
3364 
3365               // We increment the counter only if the locations are aliased
3366               // (instead of counting all alias checks). This gives a better
3367               // balance between reduced runtime and accurate dependencies.
3368               numAliased++;
3369 
3370               DepDest->MemoryDependencies.push_back(BundleMember);
3371               BundleMember->Dependencies++;
3372               ScheduleData *DestBundle = DepDest->FirstInBundle;
3373               if (!DestBundle->IsScheduled) {
3374                 BundleMember->incrementUnscheduledDeps(1);
3375               }
3376               if (!DestBundle->hasValidDependencies()) {
3377                 WorkList.push_back(DestBundle);
3378               }
3379             }
3380             DepDest = DepDest->NextLoadStore;
3381 
3382             // Example, explaining the loop break condition: Let's assume our
3383             // starting instruction is i0 and MaxMemDepDistance = 3.
3384             //
3385             //                      +--------v--v--v
3386             //             i0,i1,i2,i3,i4,i5,i6,i7,i8
3387             //             +--------^--^--^
3388             //
3389             // MaxMemDepDistance let us stop alias-checking at i3 and we add
3390             // dependencies from i0 to i3,i4,.. (even if they are not aliased).
3391             // Previously we already added dependencies from i3 to i6,i7,i8
3392             // (because of MaxMemDepDistance). As we added a dependency from
3393             // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
3394             // and we can abort this loop at i6.
3395             if (DistToSrc >= 2 * MaxMemDepDistance)
3396                 break;
3397             DistToSrc++;
3398           }
3399         }
3400       }
3401       BundleMember = BundleMember->NextInBundle;
3402     }
3403     if (InsertInReadyList && SD->isReady()) {
3404       ReadyInsts.push_back(SD);
3405       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
3406     }
3407   }
3408 }
3409 
3410 void BoUpSLP::BlockScheduling::resetSchedule() {
3411   assert(ScheduleStart &&
3412          "tried to reset schedule on block which has not been scheduled");
3413   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3414     ScheduleData *SD = getScheduleData(I);
3415     assert(isInSchedulingRegion(SD));
3416     SD->IsScheduled = false;
3417     SD->resetUnscheduledDeps();
3418   }
3419   ReadyInsts.clear();
3420 }
3421 
3422 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
3423 
3424   if (!BS->ScheduleStart)
3425     return;
3426 
3427   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
3428 
3429   BS->resetSchedule();
3430 
3431   // For the real scheduling we use a more sophisticated ready-list: it is
3432   // sorted by the original instruction location. This lets the final schedule
3433   // be as  close as possible to the original instruction order.
3434   struct ScheduleDataCompare {
3435     bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
3436       return SD2->SchedulingPriority < SD1->SchedulingPriority;
3437     }
3438   };
3439   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3440 
3441   // Ensure that all dependency data is updated and fill the ready-list with
3442   // initial instructions.
3443   int Idx = 0;
3444   int NumToSchedule = 0;
3445   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3446        I = I->getNextNode()) {
3447     ScheduleData *SD = BS->getScheduleData(I);
3448     assert(
3449         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3450         "scheduler and vectorizer have different opinion on what is a bundle");
3451     SD->FirstInBundle->SchedulingPriority = Idx++;
3452     if (SD->isSchedulingEntity()) {
3453       BS->calculateDependencies(SD, false, this);
3454       NumToSchedule++;
3455     }
3456   }
3457   BS->initialFillReadyList(ReadyInsts);
3458 
3459   Instruction *LastScheduledInst = BS->ScheduleEnd;
3460 
3461   // Do the "real" scheduling.
3462   while (!ReadyInsts.empty()) {
3463     ScheduleData *picked = *ReadyInsts.begin();
3464     ReadyInsts.erase(ReadyInsts.begin());
3465 
3466     // Move the scheduled instruction(s) to their dedicated places, if not
3467     // there yet.
3468     ScheduleData *BundleMember = picked;
3469     while (BundleMember) {
3470       Instruction *pickedInst = BundleMember->Inst;
3471       if (LastScheduledInst->getNextNode() != pickedInst) {
3472         BS->BB->getInstList().remove(pickedInst);
3473         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3474                                      pickedInst);
3475       }
3476       LastScheduledInst = pickedInst;
3477       BundleMember = BundleMember->NextInBundle;
3478     }
3479 
3480     BS->schedule(picked, ReadyInsts);
3481     NumToSchedule--;
3482   }
3483   assert(NumToSchedule == 0 && "could not schedule all instructions");
3484 
3485   // Avoid duplicate scheduling of the block.
3486   BS->ScheduleStart = nullptr;
3487 }
3488 
3489 unsigned BoUpSLP::getVectorElementSize(Value *V) {
3490   // If V is a store, just return the width of the stored value without
3491   // traversing the expression tree. This is the common case.
3492   if (auto *Store = dyn_cast<StoreInst>(V))
3493     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
3494 
3495   // If V is not a store, we can traverse the expression tree to find loads
3496   // that feed it. The type of the loaded value may indicate a more suitable
3497   // width than V's type. We want to base the vector element size on the width
3498   // of memory operations where possible.
3499   SmallVector<Instruction *, 16> Worklist;
3500   SmallPtrSet<Instruction *, 16> Visited;
3501   if (auto *I = dyn_cast<Instruction>(V))
3502     Worklist.push_back(I);
3503 
3504   // Traverse the expression tree in bottom-up order looking for loads. If we
3505   // encounter an instruciton we don't yet handle, we give up.
3506   auto MaxWidth = 0u;
3507   auto FoundUnknownInst = false;
3508   while (!Worklist.empty() && !FoundUnknownInst) {
3509     auto *I = Worklist.pop_back_val();
3510     Visited.insert(I);
3511 
3512     // We should only be looking at scalar instructions here. If the current
3513     // instruction has a vector type, give up.
3514     auto *Ty = I->getType();
3515     if (isa<VectorType>(Ty))
3516       FoundUnknownInst = true;
3517 
3518     // If the current instruction is a load, update MaxWidth to reflect the
3519     // width of the loaded value.
3520     else if (isa<LoadInst>(I))
3521       MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
3522 
3523     // Otherwise, we need to visit the operands of the instruction. We only
3524     // handle the interesting cases from buildTree here. If an operand is an
3525     // instruction we haven't yet visited, we add it to the worklist.
3526     else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3527              isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3528       for (Use &U : I->operands())
3529         if (auto *J = dyn_cast<Instruction>(U.get()))
3530           if (!Visited.count(J))
3531             Worklist.push_back(J);
3532     }
3533 
3534     // If we don't yet handle the instruction, give up.
3535     else
3536       FoundUnknownInst = true;
3537   }
3538 
3539   // If we didn't encounter a memory access in the expression tree, or if we
3540   // gave up for some reason, just return the width of V.
3541   if (!MaxWidth || FoundUnknownInst)
3542     return DL->getTypeSizeInBits(V->getType());
3543 
3544   // Otherwise, return the maximum width we found.
3545   return MaxWidth;
3546 }
3547 
3548 // Determine if a value V in a vectorizable expression Expr can be demoted to a
3549 // smaller type with a truncation. We collect the values that will be demoted
3550 // in ToDemote and additional roots that require investigating in Roots.
3551 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
3552                                   SmallVectorImpl<Value *> &ToDemote,
3553                                   SmallVectorImpl<Value *> &Roots) {
3554 
3555   // We can always demote constants.
3556   if (isa<Constant>(V)) {
3557     ToDemote.push_back(V);
3558     return true;
3559   }
3560 
3561   // If the value is not an instruction in the expression with only one use, it
3562   // cannot be demoted.
3563   auto *I = dyn_cast<Instruction>(V);
3564   if (!I || !I->hasOneUse() || !Expr.count(I))
3565     return false;
3566 
3567   switch (I->getOpcode()) {
3568 
3569   // We can always demote truncations and extensions. Since truncations can
3570   // seed additional demotion, we save the truncated value.
3571   case Instruction::Trunc:
3572     Roots.push_back(I->getOperand(0));
3573   case Instruction::ZExt:
3574   case Instruction::SExt:
3575     break;
3576 
3577   // We can demote certain binary operations if we can demote both of their
3578   // operands.
3579   case Instruction::Add:
3580   case Instruction::Sub:
3581   case Instruction::Mul:
3582   case Instruction::And:
3583   case Instruction::Or:
3584   case Instruction::Xor:
3585     if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
3586         !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
3587       return false;
3588     break;
3589 
3590   // We can demote selects if we can demote their true and false values.
3591   case Instruction::Select: {
3592     SelectInst *SI = cast<SelectInst>(I);
3593     if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
3594         !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
3595       return false;
3596     break;
3597   }
3598 
3599   // We can demote phis if we can demote all their incoming operands. Note that
3600   // we don't need to worry about cycles since we ensure single use above.
3601   case Instruction::PHI: {
3602     PHINode *PN = cast<PHINode>(I);
3603     for (Value *IncValue : PN->incoming_values())
3604       if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
3605         return false;
3606     break;
3607   }
3608 
3609   // Otherwise, conservatively give up.
3610   default:
3611     return false;
3612   }
3613 
3614   // Record the value that we can demote.
3615   ToDemote.push_back(V);
3616   return true;
3617 }
3618 
3619 void BoUpSLP::computeMinimumValueSizes() {
3620   // If there are no external uses, the expression tree must be rooted by a
3621   // store. We can't demote in-memory values, so there is nothing to do here.
3622   if (ExternalUses.empty())
3623     return;
3624 
3625   // We only attempt to truncate integer expressions.
3626   auto &TreeRoot = VectorizableTree[0].Scalars;
3627   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
3628   if (!TreeRootIT)
3629     return;
3630 
3631   // If the expression is not rooted by a store, these roots should have
3632   // external uses. We will rely on InstCombine to rewrite the expression in
3633   // the narrower type. However, InstCombine only rewrites single-use values.
3634   // This means that if a tree entry other than a root is used externally, it
3635   // must have multiple uses and InstCombine will not rewrite it. The code
3636   // below ensures that only the roots are used externally.
3637   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
3638   for (auto &EU : ExternalUses)
3639     if (!Expr.erase(EU.Scalar))
3640       return;
3641   if (!Expr.empty())
3642     return;
3643 
3644   // Collect the scalar values of the vectorizable expression. We will use this
3645   // context to determine which values can be demoted. If we see a truncation,
3646   // we mark it as seeding another demotion.
3647   for (auto &Entry : VectorizableTree)
3648     Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
3649 
3650   // Ensure the roots of the vectorizable tree don't form a cycle. They must
3651   // have a single external user that is not in the vectorizable tree.
3652   for (auto *Root : TreeRoot)
3653     if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
3654       return;
3655 
3656   // Conservatively determine if we can actually truncate the roots of the
3657   // expression. Collect the values that can be demoted in ToDemote and
3658   // additional roots that require investigating in Roots.
3659   SmallVector<Value *, 32> ToDemote;
3660   SmallVector<Value *, 4> Roots;
3661   for (auto *Root : TreeRoot)
3662     if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
3663       return;
3664 
3665   // The maximum bit width required to represent all the values that can be
3666   // demoted without loss of precision. It would be safe to truncate the roots
3667   // of the expression to this width.
3668   auto MaxBitWidth = 8u;
3669 
3670   // We first check if all the bits of the roots are demanded. If they're not,
3671   // we can truncate the roots to this narrower type.
3672   for (auto *Root : TreeRoot) {
3673     auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
3674     MaxBitWidth = std::max<unsigned>(
3675         Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
3676   }
3677 
3678   // True if the roots can be zero-extended back to their original type, rather
3679   // than sign-extended. We know that if the leading bits are not demanded, we
3680   // can safely zero-extend. So we initialize IsKnownPositive to True.
3681   bool IsKnownPositive = true;
3682 
3683   // If all the bits of the roots are demanded, we can try a little harder to
3684   // compute a narrower type. This can happen, for example, if the roots are
3685   // getelementptr indices. InstCombine promotes these indices to the pointer
3686   // width. Thus, all their bits are technically demanded even though the
3687   // address computation might be vectorized in a smaller type.
3688   //
3689   // We start by looking at each entry that can be demoted. We compute the
3690   // maximum bit width required to store the scalar by using ValueTracking to
3691   // compute the number of high-order bits we can truncate.
3692   if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
3693     MaxBitWidth = 8u;
3694 
3695     // Determine if the sign bit of all the roots is known to be zero. If not,
3696     // IsKnownPositive is set to False.
3697     IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
3698       bool KnownZero = false;
3699       bool KnownOne = false;
3700       ComputeSignBit(R, KnownZero, KnownOne, *DL);
3701       return KnownZero;
3702     });
3703 
3704     // Determine the maximum number of bits required to store the scalar
3705     // values.
3706     for (auto *Scalar : ToDemote) {
3707       auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
3708       auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
3709       MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
3710     }
3711 
3712     // If we can't prove that the sign bit is zero, we must add one to the
3713     // maximum bit width to account for the unknown sign bit. This preserves
3714     // the existing sign bit so we can safely sign-extend the root back to the
3715     // original type. Otherwise, if we know the sign bit is zero, we will
3716     // zero-extend the root instead.
3717     //
3718     // FIXME: This is somewhat suboptimal, as there will be cases where adding
3719     //        one to the maximum bit width will yield a larger-than-necessary
3720     //        type. In general, we need to add an extra bit only if we can't
3721     //        prove that the upper bit of the original type is equal to the
3722     //        upper bit of the proposed smaller type. If these two bits are the
3723     //        same (either zero or one) we know that sign-extending from the
3724     //        smaller type will result in the same value. Here, since we can't
3725     //        yet prove this, we are just making the proposed smaller type
3726     //        larger to ensure correctness.
3727     if (!IsKnownPositive)
3728       ++MaxBitWidth;
3729   }
3730 
3731   // Round MaxBitWidth up to the next power-of-two.
3732   if (!isPowerOf2_64(MaxBitWidth))
3733     MaxBitWidth = NextPowerOf2(MaxBitWidth);
3734 
3735   // If the maximum bit width we compute is less than the with of the roots'
3736   // type, we can proceed with the narrowing. Otherwise, do nothing.
3737   if (MaxBitWidth >= TreeRootIT->getBitWidth())
3738     return;
3739 
3740   // If we can truncate the root, we must collect additional values that might
3741   // be demoted as a result. That is, those seeded by truncations we will
3742   // modify.
3743   while (!Roots.empty())
3744     collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
3745 
3746   // Finally, map the values we can demote to the maximum bit with we computed.
3747   for (auto *Scalar : ToDemote)
3748     MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
3749 }
3750 
3751 namespace {
3752 /// The SLPVectorizer Pass.
3753 struct SLPVectorizer : public FunctionPass {
3754   SLPVectorizerPass Impl;
3755 
3756   /// Pass identification, replacement for typeid
3757   static char ID;
3758 
3759   explicit SLPVectorizer() : FunctionPass(ID) {
3760     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3761   }
3762 
3763 
3764   bool doInitialization(Module &M) override {
3765     return false;
3766   }
3767 
3768   bool runOnFunction(Function &F) override {
3769     if (skipFunction(F))
3770       return false;
3771 
3772     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3773     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3774     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3775     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
3776     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3777     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3778     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3779     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3780     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
3781     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
3782 
3783     return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
3784   }
3785 
3786   void getAnalysisUsage(AnalysisUsage &AU) const override {
3787     FunctionPass::getAnalysisUsage(AU);
3788     AU.addRequired<AssumptionCacheTracker>();
3789     AU.addRequired<ScalarEvolutionWrapperPass>();
3790     AU.addRequired<AAResultsWrapperPass>();
3791     AU.addRequired<TargetTransformInfoWrapperPass>();
3792     AU.addRequired<LoopInfoWrapperPass>();
3793     AU.addRequired<DominatorTreeWrapperPass>();
3794     AU.addRequired<DemandedBitsWrapperPass>();
3795     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
3796     AU.addPreserved<LoopInfoWrapperPass>();
3797     AU.addPreserved<DominatorTreeWrapperPass>();
3798     AU.addPreserved<AAResultsWrapperPass>();
3799     AU.addPreserved<GlobalsAAWrapperPass>();
3800     AU.setPreservesCFG();
3801   }
3802 };
3803 } // end anonymous namespace
3804 
3805 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
3806   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
3807   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
3808   auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
3809   auto *AA = &AM.getResult<AAManager>(F);
3810   auto *LI = &AM.getResult<LoopAnalysis>(F);
3811   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
3812   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
3813   auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
3814   auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
3815 
3816   bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
3817   if (!Changed)
3818     return PreservedAnalyses::all();
3819 
3820   PreservedAnalyses PA;
3821   PA.preserveSet<CFGAnalyses>();
3822   PA.preserve<AAManager>();
3823   PA.preserve<GlobalsAA>();
3824   return PA;
3825 }
3826 
3827 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
3828                                 TargetTransformInfo *TTI_,
3829                                 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
3830                                 LoopInfo *LI_, DominatorTree *DT_,
3831                                 AssumptionCache *AC_, DemandedBits *DB_,
3832                                 OptimizationRemarkEmitter *ORE_) {
3833   SE = SE_;
3834   TTI = TTI_;
3835   TLI = TLI_;
3836   AA = AA_;
3837   LI = LI_;
3838   DT = DT_;
3839   AC = AC_;
3840   DB = DB_;
3841   DL = &F.getParent()->getDataLayout();
3842 
3843   Stores.clear();
3844   GEPs.clear();
3845   bool Changed = false;
3846 
3847   // If the target claims to have no vector registers don't attempt
3848   // vectorization.
3849   if (!TTI->getNumberOfRegisters(true))
3850     return false;
3851 
3852   // Don't vectorize when the attribute NoImplicitFloat is used.
3853   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3854     return false;
3855 
3856   DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3857 
3858   // Use the bottom up slp vectorizer to construct chains that start with
3859   // store instructions.
3860   BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
3861 
3862   // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3863   // delete instructions.
3864 
3865   // Scan the blocks in the function in post order.
3866   for (auto BB : post_order(&F.getEntryBlock())) {
3867     collectSeedInstructions(BB);
3868 
3869     // Vectorize trees that end at stores.
3870     if (!Stores.empty()) {
3871       DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
3872                    << " underlying objects.\n");
3873       Changed |= vectorizeStoreChains(R);
3874     }
3875 
3876     // Vectorize trees that end at reductions.
3877     Changed |= vectorizeChainsInBlock(BB, R);
3878 
3879     // Vectorize the index computations of getelementptr instructions. This
3880     // is primarily intended to catch gather-like idioms ending at
3881     // non-consecutive loads.
3882     if (!GEPs.empty()) {
3883       DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
3884                    << " underlying objects.\n");
3885       Changed |= vectorizeGEPIndices(BB, R);
3886     }
3887   }
3888 
3889   if (Changed) {
3890     R.optimizeGatherSequence();
3891     DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3892     DEBUG(verifyFunction(F));
3893   }
3894   return Changed;
3895 }
3896 
3897 /// \brief Check that the Values in the slice in VL array are still existent in
3898 /// the WeakTrackingVH array.
3899 /// Vectorization of part of the VL array may cause later values in the VL array
3900 /// to become invalid. We track when this has happened in the WeakTrackingVH
3901 /// array.
3902 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
3903                                ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
3904                                unsigned SliceSize) {
3905   VL = VL.slice(SliceBegin, SliceSize);
3906   VH = VH.slice(SliceBegin, SliceSize);
3907   return !std::equal(VL.begin(), VL.end(), VH.begin());
3908 }
3909 
3910 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
3911                                             unsigned VecRegSize) {
3912   unsigned ChainLen = Chain.size();
3913   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3914         << "\n");
3915   unsigned Sz = R.getVectorElementSize(Chain[0]);
3916   unsigned VF = VecRegSize / Sz;
3917 
3918   if (!isPowerOf2_32(Sz) || VF < 2)
3919     return false;
3920 
3921   // Keep track of values that were deleted by vectorizing in the loop below.
3922   SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
3923 
3924   bool Changed = false;
3925   // Look for profitable vectorizable trees at all offsets, starting at zero.
3926   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3927     if (i + VF > e)
3928       break;
3929 
3930     // Check that a previous iteration of this loop did not delete the Value.
3931     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3932       continue;
3933 
3934     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3935           << "\n");
3936     ArrayRef<Value *> Operands = Chain.slice(i, VF);
3937 
3938     R.buildTree(Operands);
3939     if (R.isTreeTinyAndNotFullyVectorizable())
3940       continue;
3941 
3942     R.computeMinimumValueSizes();
3943 
3944     int Cost = R.getTreeCost();
3945 
3946     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3947     if (Cost < -SLPCostThreshold) {
3948       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3949       using namespace ore;
3950       R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
3951                                           cast<StoreInst>(Chain[i]))
3952                        << "Stores SLP vectorized with cost " << NV("Cost", Cost)
3953                        << " and with tree size "
3954                        << NV("TreeSize", R.getTreeSize()));
3955 
3956       R.vectorizeTree();
3957 
3958       // Move to the next bundle.
3959       i += VF - 1;
3960       Changed = true;
3961     }
3962   }
3963 
3964   return Changed;
3965 }
3966 
3967 bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
3968                                         BoUpSLP &R) {
3969   SetVector<StoreInst *> Heads, Tails;
3970   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3971 
3972   // We may run into multiple chains that merge into a single chain. We mark the
3973   // stores that we vectorized so that we don't visit the same store twice.
3974   BoUpSLP::ValueSet VectorizedStores;
3975   bool Changed = false;
3976 
3977   // Do a quadratic search on all of the given stores and find
3978   // all of the pairs of stores that follow each other.
3979   SmallVector<unsigned, 16> IndexQueue;
3980   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3981     IndexQueue.clear();
3982     // If a store has multiple consecutive store candidates, search Stores
3983     // array according to the sequence: from i+1 to e, then from i-1 to 0.
3984     // This is because usually pairing with immediate succeeding or preceding
3985     // candidate create the best chance to find slp vectorization opportunity.
3986     unsigned j = 0;
3987     for (j = i + 1; j < e; ++j)
3988       IndexQueue.push_back(j);
3989     for (j = i; j > 0; --j)
3990       IndexQueue.push_back(j - 1);
3991 
3992     for (auto &k : IndexQueue) {
3993       if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
3994         Tails.insert(Stores[k]);
3995         Heads.insert(Stores[i]);
3996         ConsecutiveChain[Stores[i]] = Stores[k];
3997         break;
3998       }
3999     }
4000   }
4001 
4002   // For stores that start but don't end a link in the chain:
4003   for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
4004        it != e; ++it) {
4005     if (Tails.count(*it))
4006       continue;
4007 
4008     // We found a store instr that starts a chain. Now follow the chain and try
4009     // to vectorize it.
4010     BoUpSLP::ValueList Operands;
4011     StoreInst *I = *it;
4012     // Collect the chain into a list.
4013     while (Tails.count(I) || Heads.count(I)) {
4014       if (VectorizedStores.count(I))
4015         break;
4016       Operands.push_back(I);
4017       // Move to the next value in the chain.
4018       I = ConsecutiveChain[I];
4019     }
4020 
4021     // FIXME: Is division-by-2 the correct step? Should we assert that the
4022     // register size is a power-of-2?
4023     for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
4024          Size /= 2) {
4025       if (vectorizeStoreChain(Operands, R, Size)) {
4026         // Mark the vectorized stores so that we don't vectorize them again.
4027         VectorizedStores.insert(Operands.begin(), Operands.end());
4028         Changed = true;
4029         break;
4030       }
4031     }
4032   }
4033 
4034   return Changed;
4035 }
4036 
4037 void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
4038 
4039   // Initialize the collections. We will make a single pass over the block.
4040   Stores.clear();
4041   GEPs.clear();
4042 
4043   // Visit the store and getelementptr instructions in BB and organize them in
4044   // Stores and GEPs according to the underlying objects of their pointer
4045   // operands.
4046   for (Instruction &I : *BB) {
4047 
4048     // Ignore store instructions that are volatile or have a pointer operand
4049     // that doesn't point to a scalar type.
4050     if (auto *SI = dyn_cast<StoreInst>(&I)) {
4051       if (!SI->isSimple())
4052         continue;
4053       if (!isValidElementType(SI->getValueOperand()->getType()))
4054         continue;
4055       Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
4056     }
4057 
4058     // Ignore getelementptr instructions that have more than one index, a
4059     // constant index, or a pointer operand that doesn't point to a scalar
4060     // type.
4061     else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
4062       auto Idx = GEP->idx_begin()->get();
4063       if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
4064         continue;
4065       if (!isValidElementType(Idx->getType()))
4066         continue;
4067       if (GEP->getType()->isVectorTy())
4068         continue;
4069       GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
4070     }
4071   }
4072 }
4073 
4074 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
4075   if (!A || !B)
4076     return false;
4077   Value *VL[] = { A, B };
4078   return tryToVectorizeList(VL, R, None, true);
4079 }
4080 
4081 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
4082                                            ArrayRef<Value *> BuildVector,
4083                                            bool AllowReorder) {
4084   if (VL.size() < 2)
4085     return false;
4086 
4087   DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
4088                << ".\n");
4089 
4090   // Check that all of the parts are scalar instructions of the same type.
4091   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
4092   if (!I0)
4093     return false;
4094 
4095   unsigned Opcode0 = I0->getOpcode();
4096 
4097   unsigned Sz = R.getVectorElementSize(I0);
4098   unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
4099   unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
4100   if (MaxVF < 2)
4101     return false;
4102 
4103   for (Value *V : VL) {
4104     Type *Ty = V->getType();
4105     if (!isValidElementType(Ty))
4106       return false;
4107     Instruction *Inst = dyn_cast<Instruction>(V);
4108     if (!Inst || Inst->getOpcode() != Opcode0)
4109       return false;
4110   }
4111 
4112   bool Changed = false;
4113 
4114   // Keep track of values that were deleted by vectorizing in the loop below.
4115   SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
4116 
4117   unsigned NextInst = 0, MaxInst = VL.size();
4118   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
4119        VF /= 2) {
4120     // No actual vectorization should happen, if number of parts is the same as
4121     // provided vectorization factor (i.e. the scalar type is used for vector
4122     // code during codegen).
4123     auto *VecTy = VectorType::get(VL[0]->getType(), VF);
4124     if (TTI->getNumberOfParts(VecTy) == VF)
4125       continue;
4126     for (unsigned I = NextInst; I < MaxInst; ++I) {
4127       unsigned OpsWidth = 0;
4128 
4129       if (I + VF > MaxInst)
4130         OpsWidth = MaxInst - I;
4131       else
4132         OpsWidth = VF;
4133 
4134       if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
4135         break;
4136 
4137       // Check that a previous iteration of this loop did not delete the Value.
4138       if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
4139         continue;
4140 
4141       DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
4142                    << "\n");
4143       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
4144 
4145       ArrayRef<Value *> BuildVectorSlice;
4146       if (!BuildVector.empty())
4147         BuildVectorSlice = BuildVector.slice(I, OpsWidth);
4148 
4149       R.buildTree(Ops, BuildVectorSlice);
4150       // TODO: check if we can allow reordering for more cases.
4151       if (AllowReorder && R.shouldReorder()) {
4152         // Conceptually, there is nothing actually preventing us from trying to
4153         // reorder a larger list. In fact, we do exactly this when vectorizing
4154         // reductions. However, at this point, we only expect to get here when
4155         // there are exactly two operations.
4156         assert(Ops.size() == 2);
4157         assert(BuildVectorSlice.empty());
4158         Value *ReorderedOps[] = {Ops[1], Ops[0]};
4159         R.buildTree(ReorderedOps, None);
4160       }
4161       if (R.isTreeTinyAndNotFullyVectorizable())
4162         continue;
4163 
4164       R.computeMinimumValueSizes();
4165       int Cost = R.getTreeCost();
4166 
4167       if (Cost < -SLPCostThreshold) {
4168         DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
4169         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
4170                                             cast<Instruction>(Ops[0]))
4171                          << "SLP vectorized with cost " << ore::NV("Cost", Cost)
4172                          << " and with tree size "
4173                          << ore::NV("TreeSize", R.getTreeSize()));
4174 
4175         Value *VectorizedRoot = R.vectorizeTree();
4176 
4177         // Reconstruct the build vector by extracting the vectorized root. This
4178         // way we handle the case where some elements of the vector are
4179         // undefined.
4180         //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
4181         if (!BuildVectorSlice.empty()) {
4182           // The insert point is the last build vector instruction. The
4183           // vectorized root will precede it. This guarantees that we get an
4184           // instruction. The vectorized tree could have been constant folded.
4185           Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
4186           unsigned VecIdx = 0;
4187           for (auto &V : BuildVectorSlice) {
4188             IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
4189                                         ++BasicBlock::iterator(InsertAfter));
4190             Instruction *I = cast<Instruction>(V);
4191             assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
4192             Instruction *Extract =
4193                 cast<Instruction>(Builder.CreateExtractElement(
4194                     VectorizedRoot, Builder.getInt32(VecIdx++)));
4195             I->setOperand(1, Extract);
4196             I->removeFromParent();
4197             I->insertAfter(Extract);
4198             InsertAfter = I;
4199           }
4200         }
4201         // Move to the next bundle.
4202         I += VF - 1;
4203         NextInst = I + 1;
4204         Changed = true;
4205       }
4206     }
4207   }
4208 
4209   return Changed;
4210 }
4211 
4212 bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
4213   if (!V)
4214     return false;
4215 
4216   Value *P = V->getParent();
4217 
4218   // Vectorize in current basic block only.
4219   auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
4220   auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
4221   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
4222     return false;
4223 
4224   // Try to vectorize V.
4225   if (tryToVectorizePair(Op0, Op1, R))
4226     return true;
4227 
4228   auto *A = dyn_cast<BinaryOperator>(Op0);
4229   auto *B = dyn_cast<BinaryOperator>(Op1);
4230   // Try to skip B.
4231   if (B && B->hasOneUse()) {
4232     auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
4233     auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
4234     if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
4235       return true;
4236     if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
4237       return true;
4238   }
4239 
4240   // Try to skip A.
4241   if (A && A->hasOneUse()) {
4242     auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
4243     auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
4244     if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
4245       return true;
4246     if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
4247       return true;
4248   }
4249   return false;
4250 }
4251 
4252 /// \brief Generate a shuffle mask to be used in a reduction tree.
4253 ///
4254 /// \param VecLen The length of the vector to be reduced.
4255 /// \param NumEltsToRdx The number of elements that should be reduced in the
4256 ///        vector.
4257 /// \param IsPairwise Whether the reduction is a pairwise or splitting
4258 ///        reduction. A pairwise reduction will generate a mask of
4259 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
4260 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
4261 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
4262 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
4263                                    bool IsPairwise, bool IsLeft,
4264                                    IRBuilder<> &Builder) {
4265   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
4266 
4267   SmallVector<Constant *, 32> ShuffleMask(
4268       VecLen, UndefValue::get(Builder.getInt32Ty()));
4269 
4270   if (IsPairwise)
4271     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
4272     for (unsigned i = 0; i != NumEltsToRdx; ++i)
4273       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
4274   else
4275     // Move the upper half of the vector to the lower half.
4276     for (unsigned i = 0; i != NumEltsToRdx; ++i)
4277       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
4278 
4279   return ConstantVector::get(ShuffleMask);
4280 }
4281 
4282 namespace {
4283 /// Model horizontal reductions.
4284 ///
4285 /// A horizontal reduction is a tree of reduction operations (currently add and
4286 /// fadd) that has operations that can be put into a vector as its leaf.
4287 /// For example, this tree:
4288 ///
4289 /// mul mul mul mul
4290 ///  \  /    \  /
4291 ///   +       +
4292 ///    \     /
4293 ///       +
4294 /// This tree has "mul" as its reduced values and "+" as its reduction
4295 /// operations. A reduction might be feeding into a store or a binary operation
4296 /// feeding a phi.
4297 ///    ...
4298 ///    \  /
4299 ///     +
4300 ///     |
4301 ///  phi +=
4302 ///
4303 ///  Or:
4304 ///    ...
4305 ///    \  /
4306 ///     +
4307 ///     |
4308 ///   *p =
4309 ///
4310 class HorizontalReduction {
4311   SmallVector<Value *, 16> ReductionOps;
4312   SmallVector<Value *, 32> ReducedVals;
4313   // Use map vector to make stable output.
4314   MapVector<Instruction *, Value *> ExtraArgs;
4315 
4316   BinaryOperator *ReductionRoot = nullptr;
4317 
4318   /// The opcode of the reduction.
4319   Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
4320   /// The opcode of the values we perform a reduction on.
4321   unsigned ReducedValueOpcode = 0;
4322   /// Should we model this reduction as a pairwise reduction tree or a tree that
4323   /// splits the vector in halves and adds those halves.
4324   bool IsPairwiseReduction = false;
4325 
4326   /// Checks if the ParentStackElem.first should be marked as a reduction
4327   /// operation with an extra argument or as extra argument itself.
4328   void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
4329                     Value *ExtraArg) {
4330     if (ExtraArgs.count(ParentStackElem.first)) {
4331       ExtraArgs[ParentStackElem.first] = nullptr;
4332       // We ran into something like:
4333       // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
4334       // The whole ParentStackElem.first should be considered as an extra value
4335       // in this case.
4336       // Do not perform analysis of remaining operands of ParentStackElem.first
4337       // instruction, this whole instruction is an extra argument.
4338       ParentStackElem.second = ParentStackElem.first->getNumOperands();
4339     } else {
4340       // We ran into something like:
4341       // ParentStackElem.first += ... + ExtraArg + ...
4342       ExtraArgs[ParentStackElem.first] = ExtraArg;
4343     }
4344   }
4345 
4346 public:
4347   HorizontalReduction() = default;
4348 
4349   /// \brief Try to find a reduction tree.
4350   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
4351     assert((!Phi || is_contained(Phi->operands(), B)) &&
4352            "Thi phi needs to use the binary operator");
4353 
4354     // We could have a initial reductions that is not an add.
4355     //  r *= v1 + v2 + v3 + v4
4356     // In such a case start looking for a tree rooted in the first '+'.
4357     if (Phi) {
4358       if (B->getOperand(0) == Phi) {
4359         Phi = nullptr;
4360         B = dyn_cast<BinaryOperator>(B->getOperand(1));
4361       } else if (B->getOperand(1) == Phi) {
4362         Phi = nullptr;
4363         B = dyn_cast<BinaryOperator>(B->getOperand(0));
4364       }
4365     }
4366 
4367     if (!B)
4368       return false;
4369 
4370     Type *Ty = B->getType();
4371     if (!isValidElementType(Ty))
4372       return false;
4373 
4374     ReductionOpcode = B->getOpcode();
4375     ReducedValueOpcode = 0;
4376     ReductionRoot = B;
4377 
4378     // We currently only support adds.
4379     if ((ReductionOpcode != Instruction::Add &&
4380          ReductionOpcode != Instruction::FAdd) ||
4381         !B->isAssociative())
4382       return false;
4383 
4384     // Post order traverse the reduction tree starting at B. We only handle true
4385     // trees containing only binary operators or selects.
4386     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
4387     Stack.push_back(std::make_pair(B, 0));
4388     while (!Stack.empty()) {
4389       Instruction *TreeN = Stack.back().first;
4390       unsigned EdgeToVist = Stack.back().second++;
4391       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
4392 
4393       // Postorder vist.
4394       if (EdgeToVist == 2 || IsReducedValue) {
4395         if (IsReducedValue)
4396           ReducedVals.push_back(TreeN);
4397         else {
4398           auto I = ExtraArgs.find(TreeN);
4399           if (I != ExtraArgs.end() && !I->second) {
4400             // Check if TreeN is an extra argument of its parent operation.
4401             if (Stack.size() <= 1) {
4402               // TreeN can't be an extra argument as it is a root reduction
4403               // operation.
4404               return false;
4405             }
4406             // Yes, TreeN is an extra argument, do not add it to a list of
4407             // reduction operations.
4408             // Stack[Stack.size() - 2] always points to the parent operation.
4409             markExtraArg(Stack[Stack.size() - 2], TreeN);
4410             ExtraArgs.erase(TreeN);
4411           } else
4412             ReductionOps.push_back(TreeN);
4413         }
4414         // Retract.
4415         Stack.pop_back();
4416         continue;
4417       }
4418 
4419       // Visit left or right.
4420       Value *NextV = TreeN->getOperand(EdgeToVist);
4421       if (NextV != Phi) {
4422         auto *I = dyn_cast<Instruction>(NextV);
4423         // Continue analysis if the next operand is a reduction operation or
4424         // (possibly) a reduced value. If the reduced value opcode is not set,
4425         // the first met operation != reduction operation is considered as the
4426         // reduced value class.
4427         if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
4428                   I->getOpcode() == ReductionOpcode)) {
4429           // Only handle trees in the current basic block.
4430           if (I->getParent() != B->getParent()) {
4431             // I is an extra argument for TreeN (its parent operation).
4432             markExtraArg(Stack.back(), I);
4433             continue;
4434           }
4435 
4436           // Each tree node needs to have one user except for the ultimate
4437           // reduction.
4438           if (!I->hasOneUse() && I != B) {
4439             // I is an extra argument for TreeN (its parent operation).
4440             markExtraArg(Stack.back(), I);
4441             continue;
4442           }
4443 
4444           if (I->getOpcode() == ReductionOpcode) {
4445             // We need to be able to reassociate the reduction operations.
4446             if (!I->isAssociative()) {
4447               // I is an extra argument for TreeN (its parent operation).
4448               markExtraArg(Stack.back(), I);
4449               continue;
4450             }
4451           } else if (ReducedValueOpcode &&
4452                      ReducedValueOpcode != I->getOpcode()) {
4453             // Make sure that the opcodes of the operations that we are going to
4454             // reduce match.
4455             // I is an extra argument for TreeN (its parent operation).
4456             markExtraArg(Stack.back(), I);
4457             continue;
4458           } else if (!ReducedValueOpcode)
4459             ReducedValueOpcode = I->getOpcode();
4460 
4461           Stack.push_back(std::make_pair(I, 0));
4462           continue;
4463         }
4464       }
4465       // NextV is an extra argument for TreeN (its parent operation).
4466       markExtraArg(Stack.back(), NextV);
4467     }
4468     return true;
4469   }
4470 
4471   /// \brief Attempt to vectorize the tree found by
4472   /// matchAssociativeReduction.
4473   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
4474     if (ReducedVals.empty())
4475       return false;
4476 
4477     // If there is a sufficient number of reduction values, reduce
4478     // to a nearby power-of-2. Can safely generate oversized
4479     // vectors and rely on the backend to split them to legal sizes.
4480     unsigned NumReducedVals = ReducedVals.size();
4481     if (NumReducedVals < 4)
4482       return false;
4483 
4484     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
4485 
4486     Value *VectorizedTree = nullptr;
4487     IRBuilder<> Builder(ReductionRoot);
4488     FastMathFlags Unsafe;
4489     Unsafe.setUnsafeAlgebra();
4490     Builder.setFastMathFlags(Unsafe);
4491     unsigned i = 0;
4492 
4493     BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
4494     // The same extra argument may be used several time, so log each attempt
4495     // to use it.
4496     for (auto &Pair : ExtraArgs)
4497       ExternallyUsedValues[Pair.second].push_back(Pair.first);
4498     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
4499       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
4500       V.buildTree(VL, ExternallyUsedValues, ReductionOps);
4501       if (V.shouldReorder()) {
4502         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
4503         V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
4504       }
4505       if (V.isTreeTinyAndNotFullyVectorizable())
4506         break;
4507 
4508       V.computeMinimumValueSizes();
4509 
4510       // Estimate cost.
4511       int Cost =
4512           V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
4513       if (Cost >= -SLPCostThreshold)
4514         break;
4515 
4516       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
4517                    << ". (HorRdx)\n");
4518       auto *I0 = cast<Instruction>(VL[0]);
4519       V.getORE()->emit(
4520           OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
4521           << "Vectorized horizontal reduction with cost "
4522           << ore::NV("Cost", Cost) << " and with tree size "
4523           << ore::NV("TreeSize", V.getTreeSize()));
4524 
4525       // Vectorize a tree.
4526       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
4527       Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
4528 
4529       // Emit a reduction.
4530       Value *ReducedSubTree =
4531           emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
4532       if (VectorizedTree) {
4533         Builder.SetCurrentDebugLocation(Loc);
4534         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
4535                                              ReducedSubTree, "bin.rdx");
4536         propagateIRFlags(VectorizedTree, ReductionOps);
4537       } else
4538         VectorizedTree = ReducedSubTree;
4539       i += ReduxWidth;
4540       ReduxWidth = PowerOf2Floor(NumReducedVals - i);
4541     }
4542 
4543     if (VectorizedTree) {
4544       // Finish the reduction.
4545       for (; i < NumReducedVals; ++i) {
4546         auto *I = cast<Instruction>(ReducedVals[i]);
4547         Builder.SetCurrentDebugLocation(I->getDebugLoc());
4548         VectorizedTree =
4549             Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
4550         propagateIRFlags(VectorizedTree, ReductionOps);
4551       }
4552       for (auto &Pair : ExternallyUsedValues) {
4553         assert(!Pair.second.empty() &&
4554                "At least one DebugLoc must be inserted");
4555         // Add each externally used value to the final reduction.
4556         for (auto *I : Pair.second) {
4557           Builder.SetCurrentDebugLocation(I->getDebugLoc());
4558           VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
4559                                                Pair.first, "bin.extra");
4560           propagateIRFlags(VectorizedTree, I);
4561         }
4562       }
4563       // Update users.
4564       ReductionRoot->replaceAllUsesWith(VectorizedTree);
4565     }
4566     return VectorizedTree != nullptr;
4567   }
4568 
4569   unsigned numReductionValues() const {
4570     return ReducedVals.size();
4571   }
4572 
4573 private:
4574   /// \brief Calculate the cost of a reduction.
4575   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
4576                        unsigned ReduxWidth) {
4577     Type *ScalarTy = FirstReducedVal->getType();
4578     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
4579 
4580     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
4581     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
4582 
4583     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
4584     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
4585 
4586     int ScalarReduxCost =
4587         (ReduxWidth - 1) *
4588         TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
4589 
4590     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
4591                  << " for reduction that starts with " << *FirstReducedVal
4592                  << " (It is a "
4593                  << (IsPairwiseReduction ? "pairwise" : "splitting")
4594                  << " reduction)\n");
4595 
4596     return VecReduxCost - ScalarReduxCost;
4597   }
4598 
4599   /// \brief Emit a horizontal reduction of the vectorized value.
4600   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
4601                        unsigned ReduxWidth, ArrayRef<Value *> RedOps,
4602                        const TargetTransformInfo *TTI) {
4603     assert(VectorizedValue && "Need to have a vectorized tree node");
4604     assert(isPowerOf2_32(ReduxWidth) &&
4605            "We only handle power-of-two reductions for now");
4606 
4607     if (!IsPairwiseReduction)
4608       return createSimpleTargetReduction(
4609           Builder, TTI, ReductionOpcode, VectorizedValue,
4610           TargetTransformInfo::ReductionFlags(), RedOps);
4611 
4612     Value *TmpVec = VectorizedValue;
4613     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
4614       Value *LeftMask =
4615           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
4616       Value *RightMask =
4617           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
4618 
4619       Value *LeftShuf = Builder.CreateShuffleVector(
4620           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
4621       Value *RightShuf = Builder.CreateShuffleVector(
4622           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
4623           "rdx.shuf.r");
4624       TmpVec =
4625           Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
4626       propagateIRFlags(TmpVec, RedOps);
4627     }
4628 
4629     // The result is in the first element of the vector.
4630     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
4631   }
4632 };
4633 } // end anonymous namespace
4634 
4635 /// \brief Recognize construction of vectors like
4636 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
4637 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
4638 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
4639 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
4640 ///
4641 /// Returns true if it matches
4642 ///
4643 static bool findBuildVector(InsertElementInst *FirstInsertElem,
4644                             SmallVectorImpl<Value *> &BuildVector,
4645                             SmallVectorImpl<Value *> &BuildVectorOpds) {
4646   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
4647     return false;
4648 
4649   InsertElementInst *IE = FirstInsertElem;
4650   while (true) {
4651     BuildVector.push_back(IE);
4652     BuildVectorOpds.push_back(IE->getOperand(1));
4653 
4654     if (IE->use_empty())
4655       return false;
4656 
4657     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
4658     if (!NextUse)
4659       return true;
4660 
4661     // If this isn't the final use, make sure the next insertelement is the only
4662     // use. It's OK if the final constructed vector is used multiple times
4663     if (!IE->hasOneUse())
4664       return false;
4665 
4666     IE = NextUse;
4667   }
4668 
4669   return false;
4670 }
4671 
4672 /// \brief Like findBuildVector, but looks backwards for construction of aggregate.
4673 ///
4674 /// \return true if it matches.
4675 static bool findBuildAggregate(InsertValueInst *IV,
4676                                SmallVectorImpl<Value *> &BuildVector,
4677                                SmallVectorImpl<Value *> &BuildVectorOpds) {
4678   Value *V;
4679   do {
4680     BuildVector.push_back(IV);
4681     BuildVectorOpds.push_back(IV->getInsertedValueOperand());
4682     V = IV->getAggregateOperand();
4683     if (isa<UndefValue>(V))
4684       break;
4685     IV = dyn_cast<InsertValueInst>(V);
4686     if (!IV || !IV->hasOneUse())
4687       return false;
4688   } while (true);
4689   std::reverse(BuildVector.begin(), BuildVector.end());
4690   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
4691   return true;
4692 }
4693 
4694 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
4695   return V->getType() < V2->getType();
4696 }
4697 
4698 /// \brief Try and get a reduction value from a phi node.
4699 ///
4700 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
4701 /// if they come from either \p ParentBB or a containing loop latch.
4702 ///
4703 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
4704 /// if not possible.
4705 static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
4706                                 BasicBlock *ParentBB, LoopInfo *LI) {
4707   // There are situations where the reduction value is not dominated by the
4708   // reduction phi. Vectorizing such cases has been reported to cause
4709   // miscompiles. See PR25787.
4710   auto DominatedReduxValue = [&](Value *R) {
4711     return (
4712         dyn_cast<Instruction>(R) &&
4713         DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
4714   };
4715 
4716   Value *Rdx = nullptr;
4717 
4718   // Return the incoming value if it comes from the same BB as the phi node.
4719   if (P->getIncomingBlock(0) == ParentBB) {
4720     Rdx = P->getIncomingValue(0);
4721   } else if (P->getIncomingBlock(1) == ParentBB) {
4722     Rdx = P->getIncomingValue(1);
4723   }
4724 
4725   if (Rdx && DominatedReduxValue(Rdx))
4726     return Rdx;
4727 
4728   // Otherwise, check whether we have a loop latch to look at.
4729   Loop *BBL = LI->getLoopFor(ParentBB);
4730   if (!BBL)
4731     return nullptr;
4732   BasicBlock *BBLatch = BBL->getLoopLatch();
4733   if (!BBLatch)
4734     return nullptr;
4735 
4736   // There is a loop latch, return the incoming value if it comes from
4737   // that. This reduction pattern occasionally turns up.
4738   if (P->getIncomingBlock(0) == BBLatch) {
4739     Rdx = P->getIncomingValue(0);
4740   } else if (P->getIncomingBlock(1) == BBLatch) {
4741     Rdx = P->getIncomingValue(1);
4742   }
4743 
4744   if (Rdx && DominatedReduxValue(Rdx))
4745     return Rdx;
4746 
4747   return nullptr;
4748 }
4749 
4750 namespace {
4751 /// Tracks instructons and its children.
4752 class WeakTrackingVHWithLevel final : public CallbackVH {
4753   /// Operand index of the instruction currently beeing analized.
4754   unsigned Level = 0;
4755   /// Is this the instruction that should be vectorized, or are we now
4756   /// processing children (i.e. operands of this instruction) for potential
4757   /// vectorization?
4758   bool IsInitial = true;
4759 
4760 public:
4761   explicit WeakTrackingVHWithLevel() = default;
4762   WeakTrackingVHWithLevel(Value *V) : CallbackVH(V){};
4763   /// Restart children analysis each time it is repaced by the new instruction.
4764   void allUsesReplacedWith(Value *New) override {
4765     setValPtr(New);
4766     Level = 0;
4767     IsInitial = true;
4768   }
4769   /// Check if the instruction was not deleted during vectorization.
4770   bool isValid() const { return !getValPtr(); }
4771   /// Is the istruction itself must be vectorized?
4772   bool isInitial() const { return IsInitial; }
4773   /// Try to vectorize children.
4774   void clearInitial() { IsInitial = false; }
4775   /// Are all children processed already?
4776   bool isFinal() const {
4777     assert(getValPtr() &&
4778            (isa<Instruction>(getValPtr()) &&
4779             cast<Instruction>(getValPtr())->getNumOperands() >= Level));
4780     return getValPtr() &&
4781            cast<Instruction>(getValPtr())->getNumOperands() == Level;
4782   }
4783   /// Get next child operation.
4784   Value *nextOperand() {
4785     assert(getValPtr() && isa<Instruction>(getValPtr()) &&
4786            cast<Instruction>(getValPtr())->getNumOperands() > Level);
4787     return cast<Instruction>(getValPtr())->getOperand(Level++);
4788   }
4789   virtual ~WeakTrackingVHWithLevel() = default;
4790 };
4791 } // namespace
4792 
4793 /// \brief Attempt to reduce a horizontal reduction.
4794 /// If it is legal to match a horizontal reduction feeding
4795 /// the phi node P with reduction operators Root in a basic block BB, then check
4796 /// if it can be done.
4797 /// \returns true if a horizontal reduction was matched and reduced.
4798 /// \returns false if a horizontal reduction was not matched.
4799 static bool canBeVectorized(
4800     PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
4801     TargetTransformInfo *TTI,
4802     const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
4803   if (!ShouldVectorizeHor)
4804     return false;
4805 
4806   if (!Root)
4807     return false;
4808 
4809   if (Root->getParent() != BB)
4810     return false;
4811   SmallVector<WeakTrackingVHWithLevel, 8> Stack(1, Root);
4812   SmallSet<Value *, 8> VisitedInstrs;
4813   bool Res = false;
4814   while (!Stack.empty()) {
4815     Value *V = Stack.back();
4816     if (!V) {
4817       Stack.pop_back();
4818       continue;
4819     }
4820     auto *Inst = dyn_cast<Instruction>(V);
4821     if (!Inst || isa<PHINode>(Inst)) {
4822       Stack.pop_back();
4823       continue;
4824     }
4825     if (Stack.back().isInitial()) {
4826       Stack.back().clearInitial();
4827       if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
4828         HorizontalReduction HorRdx;
4829         if (HorRdx.matchAssociativeReduction(P, BI)) {
4830           if (HorRdx.tryToReduce(R, TTI)) {
4831             Res = true;
4832             P = nullptr;
4833             continue;
4834           }
4835         }
4836         if (P) {
4837           Inst = dyn_cast<Instruction>(BI->getOperand(0));
4838           if (Inst == P)
4839             Inst = dyn_cast<Instruction>(BI->getOperand(1));
4840           if (!Inst) {
4841             P = nullptr;
4842             continue;
4843           }
4844         }
4845       }
4846       P = nullptr;
4847       if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
4848         Res = true;
4849         continue;
4850       }
4851     }
4852     if (Stack.back().isFinal()) {
4853       Stack.pop_back();
4854       continue;
4855     }
4856 
4857     if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
4858       if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
4859           Stack.size() < RecursionMaxDepth)
4860         Stack.push_back(NextV);
4861   }
4862   return Res;
4863 }
4864 
4865 bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
4866                                                  BasicBlock *BB, BoUpSLP &R,
4867                                                  TargetTransformInfo *TTI) {
4868   if (!V)
4869     return false;
4870   auto *I = dyn_cast<Instruction>(V);
4871   if (!I)
4872     return false;
4873 
4874   if (!isa<BinaryOperator>(I))
4875     P = nullptr;
4876   // Try to match and vectorize a horizontal reduction.
4877   return canBeVectorized(P, I, BB, R, TTI,
4878                          [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
4879                            return tryToVectorize(BI, R);
4880                          });
4881 }
4882 
4883 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
4884   bool Changed = false;
4885   SmallVector<Value *, 4> Incoming;
4886   SmallSet<Value *, 16> VisitedInstrs;
4887 
4888   bool HaveVectorizedPhiNodes = true;
4889   while (HaveVectorizedPhiNodes) {
4890     HaveVectorizedPhiNodes = false;
4891 
4892     // Collect the incoming values from the PHIs.
4893     Incoming.clear();
4894     for (Instruction &I : *BB) {
4895       PHINode *P = dyn_cast<PHINode>(&I);
4896       if (!P)
4897         break;
4898 
4899       if (!VisitedInstrs.count(P))
4900         Incoming.push_back(P);
4901     }
4902 
4903     // Sort by type.
4904     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
4905 
4906     // Try to vectorize elements base on their type.
4907     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
4908                                            E = Incoming.end();
4909          IncIt != E;) {
4910 
4911       // Look for the next elements with the same type.
4912       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
4913       while (SameTypeIt != E &&
4914              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
4915         VisitedInstrs.insert(*SameTypeIt);
4916         ++SameTypeIt;
4917       }
4918 
4919       // Try to vectorize them.
4920       unsigned NumElts = (SameTypeIt - IncIt);
4921       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
4922       // The order in which the phi nodes appear in the program does not matter.
4923       // So allow tryToVectorizeList to reorder them if it is beneficial. This
4924       // is done when there are exactly two elements since tryToVectorizeList
4925       // asserts that there are only two values when AllowReorder is true.
4926       bool AllowReorder = NumElts == 2;
4927       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
4928                                             None, AllowReorder)) {
4929         // Success start over because instructions might have been changed.
4930         HaveVectorizedPhiNodes = true;
4931         Changed = true;
4932         break;
4933       }
4934 
4935       // Start over at the next instruction of a different type (or the end).
4936       IncIt = SameTypeIt;
4937     }
4938   }
4939 
4940   VisitedInstrs.clear();
4941 
4942   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
4943     // We may go through BB multiple times so skip the one we have checked.
4944     if (!VisitedInstrs.insert(&*it).second)
4945       continue;
4946 
4947     if (isa<DbgInfoIntrinsic>(it))
4948       continue;
4949 
4950     // Try to vectorize reductions that use PHINodes.
4951     if (PHINode *P = dyn_cast<PHINode>(it)) {
4952       // Check that the PHI is a reduction PHI.
4953       if (P->getNumIncomingValues() != 2)
4954         return Changed;
4955 
4956       // Try to match and vectorize a horizontal reduction.
4957       if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
4958                                    TTI)) {
4959         Changed = true;
4960         it = BB->begin();
4961         e = BB->end();
4962         continue;
4963       }
4964       continue;
4965     }
4966 
4967     if (ShouldStartVectorizeHorAtStore) {
4968       if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
4969         // Try to match and vectorize a horizontal reduction.
4970         if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
4971                                      TTI)) {
4972           Changed = true;
4973           it = BB->begin();
4974           e = BB->end();
4975           continue;
4976         }
4977       }
4978     }
4979 
4980     // Try to vectorize horizontal reductions feeding into a return.
4981     if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
4982       if (RI->getNumOperands() != 0) {
4983         // Try to match and vectorize a horizontal reduction.
4984         if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
4985           Changed = true;
4986           it = BB->begin();
4987           e = BB->end();
4988           continue;
4989         }
4990       }
4991     }
4992 
4993     // Try to vectorize trees that start at compare instructions.
4994     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
4995       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
4996         Changed = true;
4997         // We would like to start over since some instructions are deleted
4998         // and the iterator may become invalid value.
4999         it = BB->begin();
5000         e = BB->end();
5001         continue;
5002       }
5003 
5004       for (int I = 0; I < 2; ++I) {
5005         if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
5006           Changed = true;
5007           // We would like to start over since some instructions are deleted
5008           // and the iterator may become invalid value.
5009           it = BB->begin();
5010           e = BB->end();
5011           break;
5012         }
5013       }
5014       continue;
5015     }
5016 
5017     // Try to vectorize trees that start at insertelement instructions.
5018     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
5019       SmallVector<Value *, 16> BuildVector;
5020       SmallVector<Value *, 16> BuildVectorOpds;
5021       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
5022         continue;
5023 
5024       // Vectorize starting with the build vector operands ignoring the
5025       // BuildVector instructions for the purpose of scheduling and user
5026       // extraction.
5027       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
5028         Changed = true;
5029         it = BB->begin();
5030         e = BB->end();
5031       }
5032 
5033       continue;
5034     }
5035 
5036     // Try to vectorize trees that start at insertvalue instructions feeding into
5037     // a store.
5038     if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
5039       if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
5040         const DataLayout &DL = BB->getModule()->getDataLayout();
5041         if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
5042           SmallVector<Value *, 16> BuildVector;
5043           SmallVector<Value *, 16> BuildVectorOpds;
5044           if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
5045             continue;
5046 
5047           DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
5048           if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
5049             Changed = true;
5050             it = BB->begin();
5051             e = BB->end();
5052           }
5053           continue;
5054         }
5055       }
5056     }
5057   }
5058 
5059   return Changed;
5060 }
5061 
5062 bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
5063   auto Changed = false;
5064   for (auto &Entry : GEPs) {
5065 
5066     // If the getelementptr list has fewer than two elements, there's nothing
5067     // to do.
5068     if (Entry.second.size() < 2)
5069       continue;
5070 
5071     DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
5072                  << Entry.second.size() << ".\n");
5073 
5074     // We process the getelementptr list in chunks of 16 (like we do for
5075     // stores) to minimize compile-time.
5076     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
5077       auto Len = std::min<unsigned>(BE - BI, 16);
5078       auto GEPList = makeArrayRef(&Entry.second[BI], Len);
5079 
5080       // Initialize a set a candidate getelementptrs. Note that we use a
5081       // SetVector here to preserve program order. If the index computations
5082       // are vectorizable and begin with loads, we want to minimize the chance
5083       // of having to reorder them later.
5084       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
5085 
5086       // Some of the candidates may have already been vectorized after we
5087       // initially collected them. If so, the WeakTrackingVHs will have
5088       // nullified the
5089       // values, so remove them from the set of candidates.
5090       Candidates.remove(nullptr);
5091 
5092       // Remove from the set of candidates all pairs of getelementptrs with
5093       // constant differences. Such getelementptrs are likely not good
5094       // candidates for vectorization in a bottom-up phase since one can be
5095       // computed from the other. We also ensure all candidate getelementptr
5096       // indices are unique.
5097       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
5098         auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
5099         if (!Candidates.count(GEPI))
5100           continue;
5101         auto *SCEVI = SE->getSCEV(GEPList[I]);
5102         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
5103           auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
5104           auto *SCEVJ = SE->getSCEV(GEPList[J]);
5105           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
5106             Candidates.remove(GEPList[I]);
5107             Candidates.remove(GEPList[J]);
5108           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
5109             Candidates.remove(GEPList[J]);
5110           }
5111         }
5112       }
5113 
5114       // We break out of the above computation as soon as we know there are
5115       // fewer than two candidates remaining.
5116       if (Candidates.size() < 2)
5117         continue;
5118 
5119       // Add the single, non-constant index of each candidate to the bundle. We
5120       // ensured the indices met these constraints when we originally collected
5121       // the getelementptrs.
5122       SmallVector<Value *, 16> Bundle(Candidates.size());
5123       auto BundleIndex = 0u;
5124       for (auto *V : Candidates) {
5125         auto *GEP = cast<GetElementPtrInst>(V);
5126         auto *GEPIdx = GEP->idx_begin()->get();
5127         assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
5128         Bundle[BundleIndex++] = GEPIdx;
5129       }
5130 
5131       // Try and vectorize the indices. We are currently only interested in
5132       // gather-like cases of the form:
5133       //
5134       // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
5135       //
5136       // where the loads of "a", the loads of "b", and the subtractions can be
5137       // performed in parallel. It's likely that detecting this pattern in a
5138       // bottom-up phase will be simpler and less costly than building a
5139       // full-blown top-down phase beginning at the consecutive loads.
5140       Changed |= tryToVectorizeList(Bundle, R);
5141     }
5142   }
5143   return Changed;
5144 }
5145 
5146 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
5147   bool Changed = false;
5148   // Attempt to sort and vectorize each of the store-groups.
5149   for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
5150        ++it) {
5151     if (it->second.size() < 2)
5152       continue;
5153 
5154     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
5155           << it->second.size() << ".\n");
5156 
5157     // Process the stores in chunks of 16.
5158     // TODO: The limit of 16 inhibits greater vectorization factors.
5159     //       For example, AVX2 supports v32i8. Increasing this limit, however,
5160     //       may cause a significant compile-time increase.
5161     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
5162       unsigned Len = std::min<unsigned>(CE - CI, 16);
5163       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
5164     }
5165   }
5166   return Changed;
5167 }
5168 
5169 char SLPVectorizer::ID = 0;
5170 static const char lv_name[] = "SLP Vectorizer";
5171 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
5172 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
5173 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
5174 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
5175 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
5176 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
5177 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
5178 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
5179 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
5180 
5181 namespace llvm {
5182 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
5183 }
5184