1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/ADT/MapVector.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/ADT/PostOrderIterator.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/Analysis/AliasAnalysis.h"
24 #include "llvm/Analysis/AssumptionCache.h"
25 #include "llvm/Analysis/CodeMetrics.h"
26 #include "llvm/Analysis/DemandedBits.h"
27 #include "llvm/Analysis/GlobalsModRef.h"
28 #include "llvm/Analysis/LoopAccessAnalysis.h"
29 #include "llvm/Analysis/LoopAccessAnalysis.h"
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/Analysis/ScalarEvolution.h"
32 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
33 #include "llvm/Analysis/TargetTransformInfo.h"
34 #include "llvm/Analysis/ValueTracking.h"
35 #include "llvm/Analysis/VectorUtils.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/IRBuilder.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/NoFolder.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/IR/Verifier.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/CommandLine.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include "llvm/Transforms/Vectorize.h"
51 #include <algorithm>
52 #include <memory>
53 
54 using namespace llvm;
55 
56 #define SV_NAME "slp-vectorizer"
57 #define DEBUG_TYPE "SLP"
58 
59 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
60 
61 static cl::opt<int>
62     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
63                      cl::desc("Only vectorize if you gain more than this "
64                               "number "));
65 
66 static cl::opt<bool>
67 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
68                    cl::desc("Attempt to vectorize horizontal reductions"));
69 
70 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
71     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
72     cl::desc(
73         "Attempt to vectorize horizontal reductions feeding into a store"));
74 
75 static cl::opt<int>
76 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
77     cl::desc("Attempt to vectorize for this register size in bits"));
78 
79 /// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
81 /// instructions are spread over a wide range.
82 /// This limit is way higher than needed by real-world functions.
83 static cl::opt<int>
84 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
85     cl::desc("Limit the size of the SLP scheduling region per block"));
86 
87 static cl::opt<int> MinVectorRegSizeOption(
88     "slp-min-reg-size", cl::init(128), cl::Hidden,
89     cl::desc("Attempt to vectorize for this register size in bits"));
90 
91 namespace {
92 
93 // FIXME: Set this via cl::opt to allow overriding.
94 static const unsigned RecursionMaxDepth = 12;
95 
96 // Limit the number of alias checks. The limit is chosen so that
97 // it has no negative effect on the llvm benchmarks.
98 static const unsigned AliasedCheckLimit = 10;
99 
100 // Another limit for the alias checks: The maximum distance between load/store
101 // instructions where alias checks are done.
102 // This limit is useful for very large basic blocks.
103 static const unsigned MaxMemDepDistance = 160;
104 
105 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
106 /// regions to be handled.
107 static const int MinScheduleRegionSize = 16;
108 
109 /// \brief Predicate for the element types that the SLP vectorizer supports.
110 ///
111 /// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
114 /// avoids spending time checking the cost model and realizing that they will
115 /// be inevitably scalarized.
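///
/// For example, bundles of float or i32 operations can become <N x float> or
/// <N x i32> vectors, whereas a bundle of x86_fp80 operations would only be
/// scalarized again, so it is rejected up front.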
116 static bool isValidElementType(Type *Ty) {
117   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
118          !Ty->isPPC_FP128Ty();
119 }
120 
121 /// \returns the parent basic block if all of the instructions in \p VL
122 /// are in the same block or null otherwise.
123 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
124   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
125   if (!I0)
126     return nullptr;
127   BasicBlock *BB = I0->getParent();
128   for (int i = 1, e = VL.size(); i < e; i++) {
129     Instruction *I = dyn_cast<Instruction>(VL[i]);
130     if (!I)
131       return nullptr;
132 
133     if (BB != I->getParent())
134       return nullptr;
135   }
136   return BB;
137 }
138 
139 /// \returns True if all of the values in \p VL are constants.
140 static bool allConstant(ArrayRef<Value *> VL) {
141   for (unsigned i = 0, e = VL.size(); i < e; ++i)
142     if (!isa<Constant>(VL[i]))
143       return false;
144   return true;
145 }
146 
147 /// \returns True if all of the values in \p VL are identical.
148 static bool isSplat(ArrayRef<Value *> VL) {
149   for (unsigned i = 1, e = VL.size(); i < e; ++i)
150     if (VL[i] != VL[0])
151       return false;
152   return true;
153 }
154 
/// \returns the opcode that can be combined with \p Op to create an alternate
/// sequence which can later be merged into a ShuffleVector instruction.
157 static unsigned getAltOpcode(unsigned Op) {
158   switch (Op) {
159   case Instruction::FAdd:
160     return Instruction::FSub;
161   case Instruction::FSub:
162     return Instruction::FAdd;
163   case Instruction::Add:
164     return Instruction::Sub;
165   case Instruction::Sub:
166     return Instruction::Add;
167   default:
168     return 0;
169   }
170 }
171 
/// \returns true if opcode \p Op can be part of an alternate sequence which
/// can later be merged into a ShuffleVector instruction.
175 static bool canCombineAsAltInst(unsigned Op) {
176   return Op == Instruction::FAdd || Op == Instruction::FSub ||
177          Op == Instruction::Sub || Op == Instruction::Add;
178 }
179 
/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence (i.e. the
/// opcodes are fadd, fsub, fadd, fsub, ...), and zero otherwise.
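///
/// For example, the bundle
///   %a0 = fadd float %x0, %y0
///   %a1 = fsub float %x1, %y1
///   %a2 = fadd float %x2, %y2
///   %a3 = fsub float %x3, %y3
/// alternates FAdd/FSub; it can then be emitted as one vector fadd, one
/// vector fsub, and a shufflevector that takes the even lanes from the fadd
/// result and the odd lanes from the fsub result.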
183 static unsigned isAltInst(ArrayRef<Value *> VL) {
184   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
185   unsigned Opcode = I0->getOpcode();
186   unsigned AltOpcode = getAltOpcode(Opcode);
187   for (int i = 1, e = VL.size(); i < e; i++) {
188     Instruction *I = dyn_cast<Instruction>(VL[i]);
189     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
190       return 0;
191   }
192   return Instruction::ShuffleVector;
193 }
194 
195 /// \returns The opcode if all of the Instructions in \p VL have the same
196 /// opcode, or zero.
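///
/// For example, {add, add, add, add} yields Instruction::Add, while
/// {add, sub, add, sub} yields Instruction::ShuffleVector (an alternate
/// sequence) and {add, mul, add, mul} yields zero.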
197 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
198   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
199   if (!I0)
200     return 0;
201   unsigned Opcode = I0->getOpcode();
202   for (int i = 1, e = VL.size(); i < e; i++) {
203     Instruction *I = dyn_cast<Instruction>(VL[i]);
204     if (!I || Opcode != I->getOpcode()) {
205       if (canCombineAsAltInst(Opcode) && i == 1)
206         return isAltInst(VL);
207       return 0;
208     }
209   }
210   return Opcode;
211 }
212 
213 /// Get the intersection (logical and) of all of the potential IR flags
214 /// of each scalar operation (VL) that will be converted into a vector (I).
215 /// Flag set: NSW, NUW, exact, and all of fast-math.
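///
/// For example, if every scalar add in \p VL carries 'nsw' but only some of
/// them carry 'nuw', the vectorized add is tagged with 'nsw' only.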
216 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
217   if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
218     if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
219       // Intersection is initialized to the 0th scalar,
220       // so start counting from index '1'.
221       for (int i = 1, e = VL.size(); i < e; ++i) {
222         if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
223           Intersection->andIRFlags(Scalar);
224       }
225       VecOp->copyIRFlags(Intersection);
226     }
227   }
228 }
229 
230 /// \returns \p I after propagating metadata from \p VL.
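///
/// For example, the !tbaa tags of the scalars are merged into their most
/// generic common TBAA node, !noalias and !nontemporal are intersected, and
/// any metadata kind not handled below is dropped from \p I.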
231 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
232   Instruction *I0 = cast<Instruction>(VL[0]);
233   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
234   I0->getAllMetadataOtherThanDebugLoc(Metadata);
235 
236   for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
237     unsigned Kind = Metadata[i].first;
238     MDNode *MD = Metadata[i].second;
239 
    for (int J = 1, E = VL.size(); MD && J != E; ++J) {
      Instruction *IJ = cast<Instruction>(VL[J]);
      MDNode *IMD = IJ->getMetadata(Kind);
243 
244       switch (Kind) {
245       default:
246         MD = nullptr; // Remove unknown metadata
247         break;
248       case LLVMContext::MD_tbaa:
249         MD = MDNode::getMostGenericTBAA(MD, IMD);
250         break;
251       case LLVMContext::MD_alias_scope:
252         MD = MDNode::getMostGenericAliasScope(MD, IMD);
253         break;
254       case LLVMContext::MD_noalias:
255         MD = MDNode::intersect(MD, IMD);
256         break;
257       case LLVMContext::MD_fpmath:
258         MD = MDNode::getMostGenericFPMath(MD, IMD);
259         break;
260       case LLVMContext::MD_nontemporal:
261         MD = MDNode::intersect(MD, IMD);
262         break;
263       }
264     }
265     I->setMetadata(Kind, MD);
266   }
267   return I;
268 }
269 
270 /// \returns The type that all of the values in \p VL have or null if there
271 /// are different types.
272 static Type* getSameType(ArrayRef<Value *> VL) {
273   Type *Ty = VL[0]->getType();
274   for (int i = 1, e = VL.size(); i < e; i++)
275     if (VL[i]->getType() != Ty)
276       return nullptr;
277 
278   return Ty;
279 }
280 
281 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
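///
/// For example, 'extractelement <4 x i32> %v, i32 2' matches Idx == 2, and
/// 'extractvalue {i32, i32} %agg, 1' matches Idx == 1.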
282 static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
283   assert(Opcode == Instruction::ExtractElement ||
284          Opcode == Instruction::ExtractValue);
285   if (Opcode == Instruction::ExtractElement) {
286     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
287     return CI && CI->getZExtValue() == Idx;
288   } else {
289     ExtractValueInst *EI = cast<ExtractValueInst>(E);
290     return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
291   }
292 }
293 
/// \returns True if an in-tree use also needs an extract. This happens when a
/// tree scalar is used as an operand that must remain scalar in the vectorized
/// user, such as a load/store pointer or the scalar argument of an intrinsic.
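///
/// For example, if the user is 'call float @llvm.powi.f32(float %b, i32 %e)'
/// and the tree scalar is %e, the exponent stays scalar in the vector powi
/// call, so %e must be extracted from its vectorized value.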
296 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
297                                     TargetLibraryInfo *TLI) {
298 
299   unsigned Opcode = UserInst->getOpcode();
300   switch (Opcode) {
301   case Instruction::Load: {
302     LoadInst *LI = cast<LoadInst>(UserInst);
303     return (LI->getPointerOperand() == Scalar);
304   }
305   case Instruction::Store: {
306     StoreInst *SI = cast<StoreInst>(UserInst);
307     return (SI->getPointerOperand() == Scalar);
308   }
309   case Instruction::Call: {
310     CallInst *CI = cast<CallInst>(UserInst);
311     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
312     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
313       return (CI->getArgOperand(1) == Scalar);
314     }
315   }
316   default:
317     return false;
318   }
319 }
320 
/// \returns the AA location that is being accessed by the instruction.
322 static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
323   if (StoreInst *SI = dyn_cast<StoreInst>(I))
324     return MemoryLocation::get(SI);
325   if (LoadInst *LI = dyn_cast<LoadInst>(I))
326     return MemoryLocation::get(LI);
327   return MemoryLocation();
328 }
329 
330 /// \returns True if the instruction is not a volatile or atomic load/store.
331 static bool isSimple(Instruction *I) {
332   if (LoadInst *LI = dyn_cast<LoadInst>(I))
333     return LI->isSimple();
334   if (StoreInst *SI = dyn_cast<StoreInst>(I))
335     return SI->isSimple();
336   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
337     return !MI->isVolatile();
338   return true;
339 }
340 
341 /// Bottom Up SLP Vectorizer.
342 class BoUpSLP {
343 public:
344   typedef SmallVector<Value *, 8> ValueList;
345   typedef SmallVector<Instruction *, 16> InstrList;
346   typedef SmallPtrSet<Value *, 16> ValueSet;
347   typedef SmallVector<StoreInst *, 8> StoreList;
348 
349   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
350           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
351           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
352           const DataLayout *DL)
353       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
354         SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
355         DL(DL), Builder(Se->getContext()) {
356     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
357     // Use the vector register size specified by the target unless overridden
358     // by a command-line option.
359     // TODO: It would be better to limit the vectorization factor based on
360     //       data type rather than just register size. For example, x86 AVX has
361     //       256-bit registers, but it does not support integer operations
362     //       at that width (that requires AVX2).
363     if (MaxVectorRegSizeOption.getNumOccurrences())
364       MaxVecRegSize = MaxVectorRegSizeOption;
365     else
366       MaxVecRegSize = TTI->getRegisterBitWidth(true);
367 
368     MinVecRegSize = MinVectorRegSizeOption;
369   }
370 
371   /// \brief Vectorize the tree that starts with the elements in \p VL.
372   /// Returns the vectorized root.
373   Value *vectorizeTree();
374 
375   /// \returns the cost incurred by unwanted spills and fills, caused by
376   /// holding live values over call sites.
377   int getSpillCost();
378 
379   /// \returns the vectorization cost of the subtree that starts at \p VL.
380   /// A negative number means that this is profitable.
381   int getTreeCost();
382 
383   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
384   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
385   void buildTree(ArrayRef<Value *> Roots,
386                  ArrayRef<Value *> UserIgnoreLst = None);
387 
388   /// Clear the internal data structures that are created by 'buildTree'.
389   void deleteTree() {
390     VectorizableTree.clear();
391     ScalarToTreeEntry.clear();
392     MustGather.clear();
393     ExternalUses.clear();
394     NumLoadsWantToKeepOrder = 0;
395     NumLoadsWantToChangeOrder = 0;
396     for (auto &Iter : BlocksSchedules) {
397       BlockScheduling *BS = Iter.second.get();
398       BS->clear();
399     }
400     MinBWs.clear();
401   }
402 
403   /// \brief Perform LICM and CSE on the newly generated gather sequences.
404   void optimizeGatherSequence();
405 
406   /// \returns true if it is beneficial to reverse the vector order.
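  /// For example, a two-element load bundle {a[1], a[0]} becomes a pair of
  /// consecutive loads only after reversing, which is what
  /// NumLoadsWantToChangeOrder counts.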
407   bool shouldReorder() const {
408     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
409   }
410 
411   /// \return The vector element size in bits to use when vectorizing the
412   /// expression tree ending at \p V. If V is a store, the size is the width of
413   /// the stored value. Otherwise, the size is the width of the largest loaded
414   /// value reaching V. This method is used by the vectorizer to calculate
415   /// vectorization factors.
416   unsigned getVectorElementSize(Value *V);
417 
418   /// Compute the minimum type sizes required to represent the entries in a
419   /// vectorizable tree.
420   void computeMinimumValueSizes();
421 
422   // \returns maximum vector register size as set by TTI or overridden by cl::opt.
423   unsigned getMaxVecRegSize() const {
424     return MaxVecRegSize;
425   }
426 
427   // \returns minimum vector register size as set by cl::opt.
428   unsigned getMinVecRegSize() const {
429     return MinVecRegSize;
430   }
431 
432   /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
433   ///
434   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
435   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
436 
437 private:
438   struct TreeEntry;
439 
440   /// \returns the cost of the vectorizable entry.
441   int getEntryCost(TreeEntry *E);
442 
443   /// This is the recursive part of buildTree.
444   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
445 
446   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
447   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
448   bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
449 
450   /// Vectorize a single entry in the tree.
451   Value *vectorizeTree(TreeEntry *E);
452 
453   /// Vectorize a single entry in the tree, starting in \p VL.
454   Value *vectorizeTree(ArrayRef<Value *> VL);
455 
  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or null otherwise. Repeated requests may happen in cycles.
458   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
459 
460   /// \returns the scalarization cost for this type. Scalarization in this
461   /// context means the creation of vectors from a group of scalars.
462   int getGatherCost(Type *Ty);
463 
464   /// \returns the scalarization cost for this list of values. Assuming that
465   /// this subtree gets vectorized, we may need to extract the values from the
466   /// roots. This method calculates the cost of extracting the values.
467   int getGatherCost(ArrayRef<Value *> VL);
468 
469   /// \brief Set the Builder insert point to one after the last instruction in
470   /// the bundle
471   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
472 
473   /// \returns a vector from a collection of scalars in \p VL.
474   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
475 
476   /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
478   bool isFullyVectorizableTinyTree();
479 
  /// Reorder commutative operands in an alt shuffle if doing so results in
  /// vectorized code.
482   void reorderAltShuffleOperands(ArrayRef<Value *> VL,
483                                  SmallVectorImpl<Value *> &Left,
484                                  SmallVectorImpl<Value *> &Right);
  /// Reorder commutative operands to improve the probability of generating
  /// vectorized code.
487   void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
488                                       SmallVectorImpl<Value *> &Left,
489                                       SmallVectorImpl<Value *> &Right);
490   struct TreeEntry {
491     TreeEntry() : Scalars(), VectorizedValue(nullptr),
492     NeedToGather(0) {}
493 
494     /// \returns true if the scalars in VL are equal to this entry.
495     bool isSame(ArrayRef<Value *> VL) const {
496       assert(VL.size() == Scalars.size() && "Invalid size");
497       return std::equal(VL.begin(), VL.end(), Scalars.begin());
498     }
499 
500     /// A vector of scalars.
501     ValueList Scalars;
502 
503     /// The Scalars are vectorized into this value. It is initialized to Null.
504     Value *VectorizedValue;
505 
506     /// Do we need to gather this sequence ?
507     bool NeedToGather;
508   };
509 
510   /// Create a new VectorizableTree entry.
511   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
512     VectorizableTree.emplace_back();
513     int idx = VectorizableTree.size() - 1;
514     TreeEntry *Last = &VectorizableTree[idx];
515     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
516     Last->NeedToGather = !Vectorized;
517     if (Vectorized) {
518       for (int i = 0, e = VL.size(); i != e; ++i) {
519         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
520         ScalarToTreeEntry[VL[i]] = idx;
521       }
522     } else {
523       MustGather.insert(VL.begin(), VL.end());
524     }
525     return Last;
526   }
527 
528   /// -- Vectorization State --
529   /// Holds all of the tree entries.
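  /// For a root bundle of four consecutive stores 'a[i] = b[i] + c[i]', the
  /// tree would typically contain one entry for the stores, one for the adds,
  /// and one entry for each of the two groups of consecutive loads.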
530   std::vector<TreeEntry> VectorizableTree;
531 
532   /// Maps a specific scalar to its tree entry.
533   SmallDenseMap<Value*, int> ScalarToTreeEntry;
534 
535   /// A list of scalars that we found that we need to keep as scalars.
536   ValueSet MustGather;
537 
538   /// This POD struct describes one external user in the vectorized tree.
539   struct ExternalUser {
540     ExternalUser (Value *S, llvm::User *U, int L) :
541       Scalar(S), User(U), Lane(L){}
542     // Which scalar in our function.
543     Value *Scalar;
544     // Which user that uses the scalar.
545     llvm::User *User;
546     // Which lane does the scalar belong to.
547     int Lane;
548   };
549   typedef SmallVector<ExternalUser, 16> UserList;
550 
551   /// Checks if two instructions may access the same memory.
552   ///
553   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
554   /// is invariant in the calling loop.
555   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
556                  Instruction *Inst2) {
557 
558     // First check if the result is already in the cache.
559     AliasCacheKey key = std::make_pair(Inst1, Inst2);
560     Optional<bool> &result = AliasCache[key];
561     if (result.hasValue()) {
562       return result.getValue();
563     }
564     MemoryLocation Loc2 = getLocation(Inst2, AA);
565     bool aliased = true;
566     if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
567       // Do the alias check.
568       aliased = AA->alias(Loc1, Loc2);
569     }
570     // Store the result in the cache.
571     result = aliased;
572     return aliased;
573   }
574 
575   typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
576 
577   /// Cache for alias results.
578   /// TODO: consider moving this to the AliasAnalysis itself.
579   DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
580 
581   /// Removes an instruction from its block and eventually deletes it.
582   /// It's like Instruction::eraseFromParent() except that the actual deletion
583   /// is delayed until BoUpSLP is destructed.
584   /// This is required to ensure that there are no incorrect collisions in the
585   /// AliasCache, which can happen if a new instruction is allocated at the
586   /// same address as a previously deleted instruction.
587   void eraseInstruction(Instruction *I) {
588     I->removeFromParent();
589     I->dropAllReferences();
590     DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
591   }
592 
593   /// Temporary store for deleted instructions. Instructions will be deleted
594   /// eventually when the BoUpSLP is destructed.
595   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
596 
  /// A list of values that need to be extracted out of the tree.
598   /// This list holds pairs of (Internal Scalar : External User).
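  /// For example, if a vectorized add in lane 2 is also used by a scalar
  /// 'ret' outside the tree, an (add, ret, 2) entry is recorded so that an
  /// extractelement can be generated for that external user.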
599   UserList ExternalUses;
600 
601   /// Values used only by @llvm.assume calls.
602   SmallPtrSet<const Value *, 32> EphValues;
603 
604   /// Holds all of the instructions that we gathered.
605   SetVector<Instruction *> GatherSeq;
606   /// A list of blocks that we are going to CSE.
607   SetVector<BasicBlock *> CSEBlocks;
608 
609   /// Contains all scheduling relevant data for an instruction.
610   /// A ScheduleData either represents a single instruction or a member of an
611   /// instruction bundle (= a group of instructions which is combined into a
612   /// vector instruction).
613   struct ScheduleData {
614 
615     // The initial value for the dependency counters. It means that the
616     // dependencies are not calculated yet.
617     enum { InvalidDeps = -1 };
618 
619     ScheduleData()
620         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
621           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
622           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
623           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
624 
625     void init(int BlockSchedulingRegionID) {
626       FirstInBundle = this;
627       NextInBundle = nullptr;
628       NextLoadStore = nullptr;
629       IsScheduled = false;
630       SchedulingRegionID = BlockSchedulingRegionID;
631       UnscheduledDepsInBundle = UnscheduledDeps;
632       clearDependencies();
633     }
634 
635     /// Returns true if the dependency information has been calculated.
636     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
637 
638     /// Returns true for single instructions and for bundle representatives
639     /// (= the head of a bundle).
640     bool isSchedulingEntity() const { return FirstInBundle == this; }
641 
642     /// Returns true if it represents an instruction bundle and not only a
643     /// single instruction.
644     bool isPartOfBundle() const {
645       return NextInBundle != nullptr || FirstInBundle != this;
646     }
647 
    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled dependent instructions/bundles.
650     bool isReady() const {
651       assert(isSchedulingEntity() &&
652              "can't consider non-scheduling entity for ready list");
653       return UnscheduledDepsInBundle == 0 && !IsScheduled;
654     }
655 
656     /// Modifies the number of unscheduled dependencies, also updating it for
657     /// the whole bundle.
658     int incrementUnscheduledDeps(int Incr) {
659       UnscheduledDeps += Incr;
660       return FirstInBundle->UnscheduledDepsInBundle += Incr;
661     }
662 
663     /// Sets the number of unscheduled dependencies to the number of
664     /// dependencies.
665     void resetUnscheduledDeps() {
666       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
667     }
668 
669     /// Clears all dependency information.
670     void clearDependencies() {
671       Dependencies = InvalidDeps;
672       resetUnscheduledDeps();
673       MemoryDependencies.clear();
674     }
675 
676     void dump(raw_ostream &os) const {
677       if (!isSchedulingEntity()) {
678         os << "/ " << *Inst;
679       } else if (NextInBundle) {
680         os << '[' << *Inst;
681         ScheduleData *SD = NextInBundle;
682         while (SD) {
683           os << ';' << *SD->Inst;
684           SD = SD->NextInBundle;
685         }
686         os << ']';
687       } else {
688         os << *Inst;
689       }
690     }
691 
692     Instruction *Inst;
693 
694     /// Points to the head in an instruction bundle (and always to this for
695     /// single instructions).
696     ScheduleData *FirstInBundle;
697 
    /// Singly linked list of all instructions in a bundle. Null if it is a
699     /// single instruction.
700     ScheduleData *NextInBundle;
701 
    /// Singly linked list of all memory instructions (e.g. load, store, call)
703     /// in the block - until the end of the scheduling region.
704     ScheduleData *NextLoadStore;
705 
706     /// The dependent memory instructions.
707     /// This list is derived on demand in calculateDependencies().
708     SmallVector<ScheduleData *, 4> MemoryDependencies;
709 
710     /// This ScheduleData is in the current scheduling region if this matches
711     /// the current SchedulingRegionID of BlockScheduling.
712     int SchedulingRegionID;
713 
714     /// Used for getting a "good" final ordering of instructions.
715     int SchedulingPriority;
716 
    /// The number of dependencies, i.e. the number of users of the instruction
    /// plus the number of dependent memory instructions (if any). This value
    /// is calculated on demand.
    /// If InvalidDeps, the number of dependencies has not been calculated yet.
722     int Dependencies;
723 
724     /// The number of dependencies minus the number of dependencies of scheduled
725     /// instructions. As soon as this is zero, the instruction/bundle gets ready
726     /// for scheduling.
727     /// Note that this is negative as long as Dependencies is not calculated.
728     int UnscheduledDeps;
729 
730     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
731     /// single instructions.
732     int UnscheduledDepsInBundle;
733 
734     /// True if this instruction is scheduled (or considered as scheduled in the
735     /// dry-run).
736     bool IsScheduled;
737   };
738 
739 #ifndef NDEBUG
740   friend raw_ostream &operator<<(raw_ostream &os,
741                                  const BoUpSLP::ScheduleData &SD);
742 #endif
743 
744   /// Contains all scheduling data for a basic block.
745   ///
746   struct BlockScheduling {
747 
748     BlockScheduling(BasicBlock *BB)
749         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
750           ScheduleStart(nullptr), ScheduleEnd(nullptr),
751           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
752           ScheduleRegionSize(0),
753           ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
754           // Make sure that the initial SchedulingRegionID is greater than the
755           // initial SchedulingRegionID in ScheduleData (which is 0).
756           SchedulingRegionID(1) {}
757 
758     void clear() {
759       ReadyInsts.clear();
760       ScheduleStart = nullptr;
761       ScheduleEnd = nullptr;
762       FirstLoadStoreInRegion = nullptr;
763       LastLoadStoreInRegion = nullptr;
764 
765       // Reduce the maximum schedule region size by the size of the
766       // previous scheduling run.
767       ScheduleRegionSizeLimit -= ScheduleRegionSize;
768       if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
769         ScheduleRegionSizeLimit = MinScheduleRegionSize;
770       ScheduleRegionSize = 0;
771 
772       // Make a new scheduling region, i.e. all existing ScheduleData is not
773       // in the new region yet.
774       ++SchedulingRegionID;
775     }
776 
777     ScheduleData *getScheduleData(Value *V) {
778       ScheduleData *SD = ScheduleDataMap[V];
779       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
780         return SD;
781       return nullptr;
782     }
783 
784     bool isInSchedulingRegion(ScheduleData *SD) {
785       return SD->SchedulingRegionID == SchedulingRegionID;
786     }
787 
788     /// Marks an instruction as scheduled and puts all dependent ready
789     /// instructions into the ready-list.
790     template <typename ReadyListType>
791     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
792       SD->IsScheduled = true;
793       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
794 
795       ScheduleData *BundleMember = SD;
796       while (BundleMember) {
797         // Handle the def-use chain dependencies.
798         for (Use &U : BundleMember->Inst->operands()) {
799           ScheduleData *OpDef = getScheduleData(U.get());
800           if (OpDef && OpDef->hasValidDependencies() &&
801               OpDef->incrementUnscheduledDeps(-1) == 0) {
802             // There are no more unscheduled dependencies after decrementing,
803             // so we can put the dependent instruction into the ready list.
804             ScheduleData *DepBundle = OpDef->FirstInBundle;
805             assert(!DepBundle->IsScheduled &&
806                    "already scheduled bundle gets ready");
807             ReadyList.insert(DepBundle);
808             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
809           }
810         }
811         // Handle the memory dependencies.
812         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
813           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
814             // There are no more unscheduled dependencies after decrementing,
815             // so we can put the dependent instruction into the ready list.
816             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
817             assert(!DepBundle->IsScheduled &&
818                    "already scheduled bundle gets ready");
819             ReadyList.insert(DepBundle);
820             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
821           }
822         }
823         BundleMember = BundleMember->NextInBundle;
824       }
825     }
826 
827     /// Put all instructions into the ReadyList which are ready for scheduling.
828     template <typename ReadyListType>
829     void initialFillReadyList(ReadyListType &ReadyList) {
830       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
831         ScheduleData *SD = getScheduleData(I);
832         if (SD->isSchedulingEntity() && SD->isReady()) {
833           ReadyList.insert(SD);
834           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
835         }
836       }
837     }
838 
839     /// Checks if a bundle of instructions can be scheduled, i.e. has no
840     /// cyclic dependencies. This is only a dry-run, no instructions are
841     /// actually moved at this stage.
842     bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
843 
844     /// Un-bundles a group of instructions.
845     void cancelScheduling(ArrayRef<Value *> VL);
846 
847     /// Extends the scheduling region so that V is inside the region.
848     /// \returns true if the region size is within the limit.
849     bool extendSchedulingRegion(Value *V);
850 
851     /// Initialize the ScheduleData structures for new instructions in the
852     /// scheduling region.
853     void initScheduleData(Instruction *FromI, Instruction *ToI,
854                           ScheduleData *PrevLoadStore,
855                           ScheduleData *NextLoadStore);
856 
857     /// Updates the dependency information of a bundle and of all instructions/
858     /// bundles which depend on the original bundle.
859     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
860                                BoUpSLP *SLP);
861 
    /// Sets all instructions in the scheduling region to un-scheduled.
863     void resetSchedule();
864 
865     BasicBlock *BB;
866 
867     /// Simple memory allocation for ScheduleData.
868     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
869 
870     /// The size of a ScheduleData array in ScheduleDataChunks.
871     int ChunkSize;
872 
873     /// The allocator position in the current chunk, which is the last entry
874     /// of ScheduleDataChunks.
875     int ChunkPos;
876 
877     /// Attaches ScheduleData to Instruction.
878     /// Note that the mapping survives during all vectorization iterations, i.e.
879     /// ScheduleData structures are recycled.
880     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
881 
882     struct ReadyList : SmallVector<ScheduleData *, 8> {
883       void insert(ScheduleData *SD) { push_back(SD); }
884     };
885 
886     /// The ready-list for scheduling (only used for the dry-run).
887     ReadyList ReadyInsts;
888 
889     /// The first instruction of the scheduling region.
890     Instruction *ScheduleStart;
891 
892     /// The first instruction _after_ the scheduling region.
893     Instruction *ScheduleEnd;
894 
895     /// The first memory accessing instruction in the scheduling region
896     /// (can be null).
897     ScheduleData *FirstLoadStoreInRegion;
898 
899     /// The last memory accessing instruction in the scheduling region
900     /// (can be null).
901     ScheduleData *LastLoadStoreInRegion;
902 
903     /// The current size of the scheduling region.
904     int ScheduleRegionSize;
905 
906     /// The maximum size allowed for the scheduling region.
907     int ScheduleRegionSizeLimit;
908 
909     /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented, which "removes" all ScheduleData from the region.
911     int SchedulingRegionID;
912   };
913 
914   /// Attaches the BlockScheduling structures to basic blocks.
915   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
916 
917   /// Performs the "real" scheduling. Done before vectorization is actually
918   /// performed in a basic block.
919   void scheduleBlock(BlockScheduling *BS);
920 
921   /// List of users to ignore during scheduling and that don't need extracting.
922   ArrayRef<Value *> UserIgnoreList;
923 
  // Number of load bundles that contain consecutive loads.
925   int NumLoadsWantToKeepOrder;
926 
  // Number of load bundles of size 2 that are consecutive loads if reversed.
928   int NumLoadsWantToChangeOrder;
929 
930   // Analysis and block reference.
931   Function *F;
932   ScalarEvolution *SE;
933   TargetTransformInfo *TTI;
934   TargetLibraryInfo *TLI;
935   AliasAnalysis *AA;
936   LoopInfo *LI;
937   DominatorTree *DT;
938   AssumptionCache *AC;
939   DemandedBits *DB;
940   const DataLayout *DL;
941   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
942   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
943   /// Instruction builder to construct the vectorized tree.
944   IRBuilder<> Builder;
945 
946   /// A map of scalar integer values to the smallest bit width with which they
947   /// can legally be represented.
948   MapVector<Value *, uint64_t> MinBWs;
949 };
950 
951 #ifndef NDEBUG
952 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
953   SD.dump(os);
954   return os;
955 }
956 #endif
957 
958 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
959                         ArrayRef<Value *> UserIgnoreLst) {
960   deleteTree();
961   UserIgnoreList = UserIgnoreLst;
962   if (!getSameType(Roots))
963     return;
964   buildTree_rec(Roots, 0);
965 
966   // Collect the values that we need to extract from the tree.
967   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
968     TreeEntry *Entry = &VectorizableTree[EIdx];
969 
970     // For each lane:
971     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
972       Value *Scalar = Entry->Scalars[Lane];
973 
974       // No need to handle users of gathered values.
975       if (Entry->NeedToGather)
976         continue;
977 
978       for (User *U : Scalar->users()) {
979         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
980 
981         Instruction *UserInst = dyn_cast<Instruction>(U);
982         if (!UserInst)
983           continue;
984 
985         // Skip in-tree scalars that become vectors
986         if (ScalarToTreeEntry.count(U)) {
987           int Idx = ScalarToTreeEntry[U];
988           TreeEntry *UseEntry = &VectorizableTree[Idx];
989           Value *UseScalar = UseEntry->Scalars[0];
990           // Some in-tree scalars will remain as scalar in vectorized
991           // instructions. If that is the case, the one in Lane 0 will
992           // be used.
993           if (UseScalar != U ||
994               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
995             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
996                          << ".\n");
997             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
998             continue;
999           }
1000         }
1001 
1002         // Ignore users in the user ignore list.
1003         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
1004             UserIgnoreList.end())
1005           continue;
1006 
1007         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
1008               Lane << " from " << *Scalar << ".\n");
1009         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
1010       }
1011     }
1012   }
1013 }
1014 
1015 
1016 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
1017   bool SameTy = getSameType(VL); (void)SameTy;
1018   bool isAltShuffle = false;
1019   assert(SameTy && "Invalid types!");
1020 
1021   if (Depth == RecursionMaxDepth) {
1022     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
1023     newTreeEntry(VL, false);
1024     return;
1025   }
1026 
1027   // Don't handle vectors.
1028   if (VL[0]->getType()->isVectorTy()) {
1029     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1030     newTreeEntry(VL, false);
1031     return;
1032   }
1033 
1034   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1035     if (SI->getValueOperand()->getType()->isVectorTy()) {
1036       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1037       newTreeEntry(VL, false);
1038       return;
1039     }
1040   unsigned Opcode = getSameOpcode(VL);
1041 
1042   // Check that this shuffle vector refers to the alternate
1043   // sequence of opcodes.
1044   if (Opcode == Instruction::ShuffleVector) {
1045     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1046     unsigned Op = I0->getOpcode();
1047     if (Op != Instruction::ShuffleVector)
1048       isAltShuffle = true;
1049   }
1050 
1051   // If all of the operands are identical or constant we have a simple solution.
1052   if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
1053     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1054     newTreeEntry(VL, false);
1055     return;
1056   }
1057 
1058   // We now know that this is a vector of instructions of the same type from
1059   // the same block.
1060 
1061   // Don't vectorize ephemeral values.
1062   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1063     if (EphValues.count(VL[i])) {
1064       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1065             ") is ephemeral.\n");
1066       newTreeEntry(VL, false);
1067       return;
1068     }
1069   }
1070 
1071   // Check if this is a duplicate of another entry.
1072   if (ScalarToTreeEntry.count(VL[0])) {
1073     int Idx = ScalarToTreeEntry[VL[0]];
1074     TreeEntry *E = &VectorizableTree[Idx];
1075     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1076       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1077       if (E->Scalars[i] != VL[i]) {
1078         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1079         newTreeEntry(VL, false);
1080         return;
1081       }
1082     }
1083     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1084     return;
1085   }
1086 
1087   // Check that none of the instructions in the bundle are already in the tree.
1088   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1089     if (ScalarToTreeEntry.count(VL[i])) {
1090       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1091             ") is already in tree.\n");
1092       newTreeEntry(VL, false);
1093       return;
1094     }
1095   }
1096 
1097   // If any of the scalars is marked as a value that needs to stay scalar then
1098   // we need to gather the scalars.
1099   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1100     if (MustGather.count(VL[i])) {
1101       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1102       newTreeEntry(VL, false);
1103       return;
1104     }
1105   }
1106 
1107   // Check that all of the users of the scalars that we want to vectorize are
1108   // schedulable.
1109   Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = VL0->getParent();
1111 
1112   if (!DT->isReachableFromEntry(BB)) {
1113     // Don't go into unreachable blocks. They may contain instructions with
1114     // dependency cycles which confuse the final scheduling.
1115     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1116     newTreeEntry(VL, false);
1117     return;
1118   }
1119 
  // Check that every instruction appears once in this bundle.
1121   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1122     for (unsigned j = i+1; j < e; ++j)
1123       if (VL[i] == VL[j]) {
1124         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1125         newTreeEntry(VL, false);
1126         return;
1127       }
1128 
1129   auto &BSRef = BlocksSchedules[BB];
1130   if (!BSRef) {
1131     BSRef = llvm::make_unique<BlockScheduling>(BB);
1132   }
1133   BlockScheduling &BS = *BSRef.get();
1134 
1135   if (!BS.tryScheduleBundle(VL, this)) {
1136     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1137     assert((!BS.getScheduleData(VL[0]) ||
1138             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
1139            "tryScheduleBundle should cancelScheduling on failure");
1140     newTreeEntry(VL, false);
1141     return;
1142   }
1143   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1144 
1145   switch (Opcode) {
1146     case Instruction::PHI: {
1147       PHINode *PH = dyn_cast<PHINode>(VL0);
1148 
1149       // Check for terminator values (e.g. invoke).
1150       for (unsigned j = 0; j < VL.size(); ++j)
1151         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1152           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1153               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1154           if (Term) {
1155             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1156             BS.cancelScheduling(VL);
1157             newTreeEntry(VL, false);
1158             return;
1159           }
1160         }
1161 
1162       newTreeEntry(VL, true);
1163       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1164 
1165       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1166         ValueList Operands;
1167         // Prepare the operand vector.
1168         for (unsigned j = 0; j < VL.size(); ++j)
1169           Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1170               PH->getIncomingBlock(i)));
1171 
1172         buildTree_rec(Operands, Depth + 1);
1173       }
1174       return;
1175     }
1176     case Instruction::ExtractValue:
1177     case Instruction::ExtractElement: {
1178       bool Reuse = canReuseExtract(VL, Opcode);
1179       if (Reuse) {
1180         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1181       } else {
1182         BS.cancelScheduling(VL);
1183       }
1184       newTreeEntry(VL, Reuse);
1185       return;
1186     }
1187     case Instruction::Load: {
1188       // Check that a vectorized load would load the same memory as a scalar
1189       // load.
      // For example, we don't want to vectorize loads that are smaller than
      // 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
      // treats loading/storing it as an i8 struct. If we vectorize
      // loads/stores from such a struct, we read/write packed bits
      // disagreeing with the unvectorized version.
1195       Type *ScalarTy = VL[0]->getType();
1196 
1197       if (DL->getTypeSizeInBits(ScalarTy) !=
1198           DL->getTypeAllocSizeInBits(ScalarTy)) {
1199         BS.cancelScheduling(VL);
1200         newTreeEntry(VL, false);
1201         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1202         return;
1203       }
      // Check if the loads are consecutive or if we need to swizzle them.
1205       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1206         LoadInst *L = cast<LoadInst>(VL[i]);
1207         if (!L->isSimple()) {
1208           BS.cancelScheduling(VL);
1209           newTreeEntry(VL, false);
1210           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1211           return;
1212         }
1213 
1214         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1215           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
1216             ++NumLoadsWantToChangeOrder;
1217           }
1218           BS.cancelScheduling(VL);
1219           newTreeEntry(VL, false);
1220           DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1221           return;
1222         }
1223       }
1224       ++NumLoadsWantToKeepOrder;
1225       newTreeEntry(VL, true);
1226       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1227       return;
1228     }
1229     case Instruction::ZExt:
1230     case Instruction::SExt:
1231     case Instruction::FPToUI:
1232     case Instruction::FPToSI:
1233     case Instruction::FPExt:
1234     case Instruction::PtrToInt:
1235     case Instruction::IntToPtr:
1236     case Instruction::SIToFP:
1237     case Instruction::UIToFP:
1238     case Instruction::Trunc:
1239     case Instruction::FPTrunc:
1240     case Instruction::BitCast: {
1241       Type *SrcTy = VL0->getOperand(0)->getType();
1242       for (unsigned i = 0; i < VL.size(); ++i) {
1243         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1244         if (Ty != SrcTy || !isValidElementType(Ty)) {
1245           BS.cancelScheduling(VL);
1246           newTreeEntry(VL, false);
1247           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1248           return;
1249         }
1250       }
1251       newTreeEntry(VL, true);
1252       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1253 
1254       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1255         ValueList Operands;
1256         // Prepare the operand vector.
1257         for (unsigned j = 0; j < VL.size(); ++j)
1258           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1259 
1260         buildTree_rec(Operands, Depth+1);
1261       }
1262       return;
1263     }
1264     case Instruction::ICmp:
1265     case Instruction::FCmp: {
1266       // Check that all of the compares have the same predicate.
1267       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1268       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1269       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1270         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1271         if (Cmp->getPredicate() != P0 ||
1272             Cmp->getOperand(0)->getType() != ComparedTy) {
1273           BS.cancelScheduling(VL);
1274           newTreeEntry(VL, false);
1275           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1276           return;
1277         }
1278       }
1279 
1280       newTreeEntry(VL, true);
1281       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1282 
1283       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1284         ValueList Operands;
1285         // Prepare the operand vector.
1286         for (unsigned j = 0; j < VL.size(); ++j)
1287           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1288 
1289         buildTree_rec(Operands, Depth+1);
1290       }
1291       return;
1292     }
1293     case Instruction::Select:
1294     case Instruction::Add:
1295     case Instruction::FAdd:
1296     case Instruction::Sub:
1297     case Instruction::FSub:
1298     case Instruction::Mul:
1299     case Instruction::FMul:
1300     case Instruction::UDiv:
1301     case Instruction::SDiv:
1302     case Instruction::FDiv:
1303     case Instruction::URem:
1304     case Instruction::SRem:
1305     case Instruction::FRem:
1306     case Instruction::Shl:
1307     case Instruction::LShr:
1308     case Instruction::AShr:
1309     case Instruction::And:
1310     case Instruction::Or:
1311     case Instruction::Xor: {
1312       newTreeEntry(VL, true);
1313       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1314 
1315       // Sort operands of the instructions so that each side is more likely to
1316       // have the same opcode.
1317       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1318         ValueList Left, Right;
1319         reorderInputsAccordingToOpcode(VL, Left, Right);
1320         buildTree_rec(Left, Depth + 1);
1321         buildTree_rec(Right, Depth + 1);
1322         return;
1323       }
1324 
1325       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1326         ValueList Operands;
1327         // Prepare the operand vector.
1328         for (unsigned j = 0; j < VL.size(); ++j)
1329           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1330 
1331         buildTree_rec(Operands, Depth+1);
1332       }
1333       return;
1334     }
1335     case Instruction::GetElementPtr: {
1336       // We don't combine GEPs with complicated (nested) indexing.
1337       for (unsigned j = 0; j < VL.size(); ++j) {
1338         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1339           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1340           BS.cancelScheduling(VL);
1341           newTreeEntry(VL, false);
1342           return;
1343         }
1344       }
1345 
1346       // We can't combine several GEPs into one vector if they operate on
1347       // different types.
1348       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1349       for (unsigned j = 0; j < VL.size(); ++j) {
1350         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1351         if (Ty0 != CurTy) {
1352           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1353           BS.cancelScheduling(VL);
1354           newTreeEntry(VL, false);
1355           return;
1356         }
1357       }
1358 
1359       // We don't combine GEPs with non-constant indexes.
1360       for (unsigned j = 0; j < VL.size(); ++j) {
1361         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1362         if (!isa<ConstantInt>(Op)) {
1363           DEBUG(
1364               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1365           BS.cancelScheduling(VL);
1366           newTreeEntry(VL, false);
1367           return;
1368         }
1369       }
1370 
1371       newTreeEntry(VL, true);
1372       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1373       for (unsigned i = 0, e = 2; i < e; ++i) {
1374         ValueList Operands;
1375         // Prepare the operand vector.
1376         for (unsigned j = 0; j < VL.size(); ++j)
1377           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1378 
1379         buildTree_rec(Operands, Depth + 1);
1380       }
1381       return;
1382     }
1383     case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
1385       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1386         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
1387           BS.cancelScheduling(VL);
1388           newTreeEntry(VL, false);
1389           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1390           return;
1391         }
1392 
1393       newTreeEntry(VL, true);
1394       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1395 
1396       ValueList Operands;
1397       for (unsigned j = 0; j < VL.size(); ++j)
1398         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1399 
1400       buildTree_rec(Operands, Depth + 1);
1401       return;
1402     }
1403     case Instruction::Call: {
1404       // Check if the calls are all to the same vectorizable intrinsic.
1405       CallInst *CI = cast<CallInst>(VL[0]);
1406       // Check if this is an Intrinsic call or something that can be
      // represented by an intrinsic call.
1408       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1409       if (!isTriviallyVectorizable(ID)) {
1410         BS.cancelScheduling(VL);
1411         newTreeEntry(VL, false);
1412         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1413         return;
1414       }
1415       Function *Int = CI->getCalledFunction();
1416       Value *A1I = nullptr;
1417       if (hasVectorInstrinsicScalarOpd(ID, 1))
1418         A1I = CI->getArgOperand(1);
1419       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1420         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1421         if (!CI2 || CI2->getCalledFunction() != Int ||
1422             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
1423             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
1424           BS.cancelScheduling(VL);
1425           newTreeEntry(VL, false);
1426           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1427                        << "\n");
1428           return;
1429         }
        // ctlz, cttz and powi are special intrinsics whose second argument
        // must be the same for them to be vectorized.
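        // For illustration only (hypothetical values): two calls such as
        // llvm.powi.f32(%a, 3) and llvm.powi.f32(%b, 3) can stay in one
        // bundle, whereas exponents 3 and 4 would force a gather.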
1432         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1433           Value *A1J = CI2->getArgOperand(1);
1434           if (A1I != A1J) {
1435             BS.cancelScheduling(VL);
1436             newTreeEntry(VL, false);
1437             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1438                          << " argument "<< A1I<<"!=" << A1J
1439                          << "\n");
1440             return;
1441           }
1442         }
1443         // Verify that the bundle operands are identical between the two calls.
1444         if (CI->hasOperandBundles() &&
1445             !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
1446                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
1447                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
1448           BS.cancelScheduling(VL);
1449           newTreeEntry(VL, false);
1450           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
1451                        << *VL[i] << '\n');
1452           return;
1453         }
1454       }
1455 
1456       newTreeEntry(VL, true);
1457       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1458         ValueList Operands;
1459         // Prepare the operand vector.
1460         for (unsigned j = 0; j < VL.size(); ++j) {
          CallInst *CI2 = cast<CallInst>(VL[j]);
          Operands.push_back(CI2->getArgOperand(i));
1463         }
1464         buildTree_rec(Operands, Depth + 1);
1465       }
1466       return;
1467     }
1468     case Instruction::ShuffleVector: {
      // If this is not an alternating sequence of opcodes like add-sub
      // then do not vectorize this instruction.
1471       if (!isAltShuffle) {
1472         BS.cancelScheduling(VL);
1473         newTreeEntry(VL, false);
1474         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1475         return;
1476       }
1477       newTreeEntry(VL, true);
1478       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1479 
1480       // Reorder operands if reordering would enable vectorization.
1481       if (isa<BinaryOperator>(VL0)) {
1482         ValueList Left, Right;
1483         reorderAltShuffleOperands(VL, Left, Right);
1484         buildTree_rec(Left, Depth + 1);
1485         buildTree_rec(Right, Depth + 1);
1486         return;
1487       }
1488 
1489       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1490         ValueList Operands;
1491         // Prepare the operand vector.
1492         for (unsigned j = 0; j < VL.size(); ++j)
1493           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1494 
1495         buildTree_rec(Operands, Depth + 1);
1496       }
1497       return;
1498     }
1499     default:
1500       BS.cancelScheduling(VL);
1501       newTreeEntry(VL, false);
1502       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1503       return;
1504   }
1505 }
1506 
1507 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
1508   unsigned N;
1509   Type *EltTy;
1510   auto *ST = dyn_cast<StructType>(T);
1511   if (ST) {
1512     N = ST->getNumElements();
1513     EltTy = *ST->element_begin();
1514   } else {
1515     N = cast<ArrayType>(T)->getNumElements();
1516     EltTy = cast<ArrayType>(T)->getElementType();
1517   }
1518   if (!isValidElementType(EltTy))
1519     return 0;
1520   uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL.getTypeStoreSizeInBits(T))
1522     return 0;
1523   if (ST) {
1524     // Check that struct is homogeneous.
1525     for (const auto *Ty : ST->elements())
1526       if (Ty != EltTy)
1527         return 0;
1528   }
1529   return N;
1530 }
1531 
1532 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
1533   assert(Opcode == Instruction::ExtractElement ||
1534          Opcode == Instruction::ExtractValue);
1535   assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
1536   // Check if all of the extracts come from the same vector and from the
1537   // correct offset.
1538   Value *VL0 = VL[0];
1539   Instruction *E0 = cast<Instruction>(VL0);
1540   Value *Vec = E0->getOperand(0);
1541 
  // We have to extract from a vector/aggregate with the same number of
  // elements.
1543   unsigned NElts;
1544   if (Opcode == Instruction::ExtractValue) {
1545     const DataLayout &DL = E0->getModule()->getDataLayout();
1546     NElts = canMapToVector(Vec->getType(), DL);
1547     if (!NElts)
1548       return false;
    // Check if the load can be rewritten as a load of a vector.
1550     LoadInst *LI = dyn_cast<LoadInst>(Vec);
1551     if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
1552       return false;
1553   } else {
1554     NElts = Vec->getType()->getVectorNumElements();
1555   }
1556 
1557   if (NElts != VL.size())
1558     return false;
1559 
1560   // Check that all of the indices extract from the correct offset.
1561   if (!matchExtractIndex(E0, 0, Opcode))
1562     return false;
1563 
1564   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1565     Instruction *E = cast<Instruction>(VL[i]);
1566     if (!matchExtractIndex(E, i, Opcode))
1567       return false;
1568     if (E->getOperand(0) != Vec)
1569       return false;
1570   }
1571 
1572   return true;
1573 }
1574 
1575 int BoUpSLP::getEntryCost(TreeEntry *E) {
1576   ArrayRef<Value*> VL = E->Scalars;
1577 
1578   Type *ScalarTy = VL[0]->getType();
1579   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1580     ScalarTy = SI->getValueOperand()->getType();
1581   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1582 
1583   // If we have computed a smaller type for the expression, update VecTy so
1584   // that the costs will be accurate.
1585   if (MinBWs.count(VL[0]))
1586     VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
1587                             VL.size());
1588 
1589   if (E->NeedToGather) {
1590     if (allConstant(VL))
1591       return 0;
1592     if (isSplat(VL)) {
1593       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1594     }
1595     return getGatherCost(E->Scalars);
1596   }
1597   unsigned Opcode = getSameOpcode(VL);
1598   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1599   Instruction *VL0 = cast<Instruction>(VL[0]);
1600   switch (Opcode) {
1601     case Instruction::PHI: {
1602       return 0;
1603     }
1604     case Instruction::ExtractValue:
1605     case Instruction::ExtractElement: {
1606       if (canReuseExtract(VL, Opcode)) {
1607         int DeadCost = 0;
1608         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1609           Instruction *E = cast<Instruction>(VL[i]);
1610           if (E->hasOneUse())
1611             // Take credit for instruction that will become dead.
1612             DeadCost +=
1613                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1614         }
1615         return -DeadCost;
1616       }
1617       return getGatherCost(VecTy);
1618     }
1619     case Instruction::ZExt:
1620     case Instruction::SExt:
1621     case Instruction::FPToUI:
1622     case Instruction::FPToSI:
1623     case Instruction::FPExt:
1624     case Instruction::PtrToInt:
1625     case Instruction::IntToPtr:
1626     case Instruction::SIToFP:
1627     case Instruction::UIToFP:
1628     case Instruction::Trunc:
1629     case Instruction::FPTrunc:
1630     case Instruction::BitCast: {
1631       Type *SrcTy = VL0->getOperand(0)->getType();
1632 
1633       // Calculate the cost of this instruction.
1634       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1635                                                          VL0->getType(), SrcTy);
1636 
1637       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1638       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1639       return VecCost - ScalarCost;
1640     }
1641     case Instruction::FCmp:
1642     case Instruction::ICmp:
1643     case Instruction::Select: {
1644       // Calculate the cost of this instruction.
1645       VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1646       int ScalarCost = VecTy->getNumElements() *
1647           TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1648       int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1649       return VecCost - ScalarCost;
1650     }
1651     case Instruction::Add:
1652     case Instruction::FAdd:
1653     case Instruction::Sub:
1654     case Instruction::FSub:
1655     case Instruction::Mul:
1656     case Instruction::FMul:
1657     case Instruction::UDiv:
1658     case Instruction::SDiv:
1659     case Instruction::FDiv:
1660     case Instruction::URem:
1661     case Instruction::SRem:
1662     case Instruction::FRem:
1663     case Instruction::Shl:
1664     case Instruction::LShr:
1665     case Instruction::AShr:
1666     case Instruction::And:
1667     case Instruction::Or:
1668     case Instruction::Xor: {
1669       // Certain instructions can be cheaper to vectorize if they have a
1670       // constant second vector operand.
1671       TargetTransformInfo::OperandValueKind Op1VK =
1672           TargetTransformInfo::OK_AnyValue;
1673       TargetTransformInfo::OperandValueKind Op2VK =
1674           TargetTransformInfo::OK_UniformConstantValue;
1675       TargetTransformInfo::OperandValueProperties Op1VP =
1676           TargetTransformInfo::OP_None;
1677       TargetTransformInfo::OperandValueProperties Op2VP =
1678           TargetTransformInfo::OP_None;
1679 
1680       // If all operands are exactly the same ConstantInt then set the
1681       // operand kind to OK_UniformConstantValue.
1682       // If instead not all operands are constants, then set the operand kind
1683       // to OK_AnyValue. If all operands are constants but not the same,
1684       // then set the operand kind to OK_NonUniformConstantValue.
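      // For illustration only (hypothetical bundles): {add %a, 2; add %b, 2}
      // keeps OK_UniformConstantValue, {add %a, 2; add %b, 4} becomes
      // OK_NonUniformConstantValue, and {add %a, %c; add %b, 2} falls back to
      // OK_AnyValue.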
1685       ConstantInt *CInt = nullptr;
1686       for (unsigned i = 0; i < VL.size(); ++i) {
1687         const Instruction *I = cast<Instruction>(VL[i]);
1688         if (!isa<ConstantInt>(I->getOperand(1))) {
1689           Op2VK = TargetTransformInfo::OK_AnyValue;
1690           break;
1691         }
1692         if (i == 0) {
1693           CInt = cast<ConstantInt>(I->getOperand(1));
1694           continue;
1695         }
1696         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1697             CInt != cast<ConstantInt>(I->getOperand(1)))
1698           Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1699       }
      // FIXME: Currently the cost model modification for division by a power
      // of 2 is handled only for X86 and AArch64. Add support for other
      // targets.
1702       if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1703           CInt->getValue().isPowerOf2())
1704         Op2VP = TargetTransformInfo::OP_PowerOf2;
1705 
1706       int ScalarCost = VecTy->getNumElements() *
1707                        TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
1708                                                    Op2VK, Op1VP, Op2VP);
1709       int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1710                                                 Op1VP, Op2VP);
1711       return VecCost - ScalarCost;
1712     }
1713     case Instruction::GetElementPtr: {
1714       TargetTransformInfo::OperandValueKind Op1VK =
1715           TargetTransformInfo::OK_AnyValue;
1716       TargetTransformInfo::OperandValueKind Op2VK =
1717           TargetTransformInfo::OK_UniformConstantValue;
1718 
1719       int ScalarCost =
1720           VecTy->getNumElements() *
1721           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1722       int VecCost =
1723           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1724 
1725       return VecCost - ScalarCost;
1726     }
1727     case Instruction::Load: {
1728       // Cost of wide load - cost of scalar loads.
      unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
1730       int ScalarLdCost = VecTy->getNumElements() *
1731             TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
1732       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
1733                                            VecTy, alignment, 0);
1734       return VecLdCost - ScalarLdCost;
1735     }
1736     case Instruction::Store: {
1737       // We know that we can merge the stores. Calculate the cost.
      unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
1739       int ScalarStCost = VecTy->getNumElements() *
1740             TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
1741       int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
1742                                            VecTy, alignment, 0);
1743       return VecStCost - ScalarStCost;
1744     }
1745     case Instruction::Call: {
1746       CallInst *CI = cast<CallInst>(VL0);
1747       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1748 
1749       // Calculate the cost of the scalar and vector calls.
1750       SmallVector<Type*, 4> ScalarTys, VecTys;
      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
1752         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1753         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1754                                          VecTy->getNumElements()));
1755       }
1756 
1757       FastMathFlags FMF;
1758       if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
1759         FMF = FPMO->getFastMathFlags();
1760 
1761       int ScalarCallCost = VecTy->getNumElements() *
1762           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
1763 
1764       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
1765 
1766       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1767             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1768             << " for " << *CI << "\n");
1769 
1770       return VecCallCost - ScalarCallCost;
1771     }
1772     case Instruction::ShuffleVector: {
1773       TargetTransformInfo::OperandValueKind Op1VK =
1774           TargetTransformInfo::OK_AnyValue;
1775       TargetTransformInfo::OperandValueKind Op2VK =
1776           TargetTransformInfo::OK_AnyValue;
1777       int ScalarCost = 0;
1778       int VecCost = 0;
1779       for (unsigned i = 0; i < VL.size(); ++i) {
        Instruction *I = cast<Instruction>(VL[i]);
1783         ScalarCost +=
1784             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1785       }
      // VecCost is equal to the sum of the cost of creating the two vector
      // operations and the cost of creating the shuffle.
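      // For illustration only: for an add/sub alternation this is roughly
      // cost(add <N x T>) + cost(sub <N x T>) + cost(SK_Alternate shuffle).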
1788       Instruction *I0 = cast<Instruction>(VL[0]);
1789       VecCost =
1790           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1791       Instruction *I1 = cast<Instruction>(VL[1]);
1792       VecCost +=
1793           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1794       VecCost +=
1795           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1796       return VecCost - ScalarCost;
1797     }
1798     default:
1799       llvm_unreachable("Unknown instruction");
1800   }
1801 }
1802 
1803 bool BoUpSLP::isFullyVectorizableTinyTree() {
1804   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1805         VectorizableTree.size() << " is fully vectorizable .\n");
1806 
1807   // We only handle trees of height 2.
1808   if (VectorizableTree.size() != 2)
1809     return false;
1810 
1811   // Handle splat and all-constants stores.
1812   if (!VectorizableTree[0].NeedToGather &&
1813       (allConstant(VectorizableTree[1].Scalars) ||
1814        isSplat(VectorizableTree[1].Scalars)))
1815     return true;
1816 
1817   // Gathering cost would be too much for tiny trees.
1818   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1819     return false;
1820 
1821   return true;
1822 }
1823 
1824 int BoUpSLP::getSpillCost() {
1825   // Walk from the bottom of the tree to the top, tracking which values are
1826   // live. When we see a call instruction that is not part of our tree,
1827   // query TTI to see if there is a cost to keeping values live over it
1828   // (for example, if spills and fills are required).
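  // For illustration only (hypothetical IR): if a bundle value %a is live
  // across an unrelated "call void @foo()" that sits between two tree
  // instructions, the loop below charges
  // TTI->getCostOfKeepingLiveOverCall() for a <BundleWidth x typeof(%a)>
  // vector.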
1829   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1830   int Cost = 0;
1831 
1832   SmallPtrSet<Instruction*, 4> LiveValues;
1833   Instruction *PrevInst = nullptr;
1834 
1835   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1836     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1837     if (!Inst)
1838       continue;
1839 
1840     if (!PrevInst) {
1841       PrevInst = Inst;
1842       continue;
1843     }
1844 
1845     // Update LiveValues.
1846     LiveValues.erase(PrevInst);
1847     for (auto &J : PrevInst->operands()) {
1848       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1849         LiveValues.insert(cast<Instruction>(&*J));
1850     }
1851 
1852     DEBUG(
1853       dbgs() << "SLP: #LV: " << LiveValues.size();
1854       for (auto *X : LiveValues)
1855         dbgs() << " " << X->getName();
1856       dbgs() << ", Looking at ";
1857       Inst->dump();
1858       );
1859 
1860     // Now find the sequence of instructions between PrevInst and Inst.
1861     BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
1862         PrevInstIt(PrevInst->getIterator());
1863     --PrevInstIt;
1864     while (InstIt != PrevInstIt) {
1865       if (PrevInstIt == PrevInst->getParent()->rend()) {
1866         PrevInstIt = Inst->getParent()->rbegin();
1867         continue;
1868       }
1869 
1870       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1871         SmallVector<Type*, 4> V;
1872         for (auto *II : LiveValues)
1873           V.push_back(VectorType::get(II->getType(), BundleWidth));
1874         Cost += TTI->getCostOfKeepingLiveOverCall(V);
1875       }
1876 
1877       ++PrevInstIt;
1878     }
1879 
1880     PrevInst = Inst;
1881   }
1882 
1883   return Cost;
1884 }
1885 
1886 int BoUpSLP::getTreeCost() {
1887   int Cost = 0;
1888   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1889         VectorizableTree.size() << ".\n");
1890 
  // We only vectorize tiny trees if they are fully vectorizable.
1892   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1893     if (VectorizableTree.empty()) {
1894       assert(!ExternalUses.size() && "We should not have any external users");
1895     }
1896     return INT_MAX;
1897   }
1898 
1899   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1900 
1901   for (TreeEntry &TE : VectorizableTree) {
1902     int C = getEntryCost(&TE);
1903     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1904                  << *TE.Scalars[0] << ".\n");
1905     Cost += C;
1906   }
1907 
1908   SmallSet<Value *, 16> ExtractCostCalculated;
1909   int ExtractCost = 0;
1910   for (ExternalUser &EU : ExternalUses) {
1911     // We only add extract cost once for the same scalar.
1912     if (!ExtractCostCalculated.insert(EU.Scalar).second)
1913       continue;
1914 
1915     // Uses by ephemeral values are free (because the ephemeral value will be
1916     // removed prior to code generation, and so the extraction will be
1917     // removed as well).
1918     if (EphValues.count(EU.User))
1919       continue;
1920 
1921     // If we plan to rewrite the tree in a smaller type, we will need to sign
1922     // extend the extracted value back to the original type. Here, we account
1923     // for the extract and the added cost of the sign extend if needed.
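    // For illustration only: if the tree was demoted to i8 but the external
    // user consumes an i32, we query getExtractWithExtendCost for a
    // sign-extending extract from the <BundleWidth x i8> vector instead of
    // costing a plain extractelement.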
1924     auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
1925     auto *ScalarRoot = VectorizableTree[0].Scalars[0];
1926     if (MinBWs.count(ScalarRoot)) {
1927       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
1928       VecTy = VectorType::get(MinTy, BundleWidth);
1929       ExtractCost += TTI->getExtractWithExtendCost(
1930           Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
1931     } else {
1932       ExtractCost +=
1933           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
1934     }
1935   }
1936 
1937   int SpillCost = getSpillCost();
1938   Cost += SpillCost + ExtractCost;
1939 
1940   DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
1941                << "SLP: Extract Cost = " << ExtractCost << ".\n"
1942                << "SLP: Total Cost = " << Cost << ".\n");
1943   return Cost;
1944 }
1945 
1946 int BoUpSLP::getGatherCost(Type *Ty) {
1947   int Cost = 0;
1948   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1949     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1950   return Cost;
1951 }
1952 
1953 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1954   // Find the type of the operands in VL.
1955   Type *ScalarTy = VL[0]->getType();
1956   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1957     ScalarTy = SI->getValueOperand()->getType();
1958   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1959   // Find the cost of inserting/extracting values from the vector.
1960   return getGatherCost(VecTy);
1961 }
1962 
// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like -
// load a[0] - load b[0]
// load b[1] + load a[1]
// load a[2] - load b[2]
// load a[3] + load b[3]
// Reordering the second operation, "load b[1] + load a[1]", to
// "load a[1] + load b[1]" allows us to vectorize this code.
1972 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1973                                         SmallVectorImpl<Value *> &Left,
1974                                         SmallVectorImpl<Value *> &Right) {
1975   // Push left and right operands of binary operation into Left and Right
1976   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1977     Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1978     Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1979   }
1980 
1981   // Reorder if we have a commutative operation and consecutive access
1982   // are on either side of the alternate instructions.
1983   for (unsigned j = 0; j < VL.size() - 1; ++j) {
1984     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1985       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1986         Instruction *VL1 = cast<Instruction>(VL[j]);
1987         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1988         if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
1989           std::swap(Left[j], Right[j]);
1990           continue;
1991         } else if (VL2->isCommutative() &&
1992                    isConsecutiveAccess(L, L1, *DL, *SE)) {
1993           std::swap(Left[j + 1], Right[j + 1]);
1994           continue;
1995         }
1996         // else unchanged
1997       }
1998     }
1999     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2000       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2001         Instruction *VL1 = cast<Instruction>(VL[j]);
2002         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
2003         if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
2004           std::swap(Left[j], Right[j]);
2005           continue;
2006         } else if (VL2->isCommutative() &&
2007                    isConsecutiveAccess(L, L1, *DL, *SE)) {
2008           std::swap(Left[j + 1], Right[j + 1]);
2009           continue;
2010         }
2011         // else unchanged
2012       }
2013     }
2014   }
2015 }
2016 
// Return true if I should be commuted before adding its left and right
// operands to the arrays Left and Right.
//
// The vectorizer is trying to either have all elements on one side be
// instructions with the same opcode to enable further vectorization, or to
// have a splat to lower the vectorization cost.
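// For illustration only (hypothetical values): if Right is currently the
// splat {%x, %x}, the left side is not a matching splat, and the next
// instruction is "sub %x, %y", commuting keeps %x on the right-hand side and
// preserves SplatRight.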
2023 static bool shouldReorderOperands(int i, Instruction &I,
2024                                   SmallVectorImpl<Value *> &Left,
2025                                   SmallVectorImpl<Value *> &Right,
2026                                   bool AllSameOpcodeLeft,
2027                                   bool AllSameOpcodeRight, bool SplatLeft,
2028                                   bool SplatRight) {
2029   Value *VLeft = I.getOperand(0);
2030   Value *VRight = I.getOperand(1);
2031   // If we have "SplatRight", try to see if commuting is needed to preserve it.
2032   if (SplatRight) {
2033     if (VRight == Right[i - 1])
2034       // Preserve SplatRight
2035       return false;
2036     if (VLeft == Right[i - 1]) {
2037       // Commuting would preserve SplatRight, but we don't want to break
2038       // SplatLeft either, i.e. preserve the original order if possible.
2039       // (FIXME: why do we care?)
2040       if (SplatLeft && VLeft == Left[i - 1])
2041         return false;
2042       return true;
2043     }
2044   }
2045   // Symmetrically handle Right side.
2046   if (SplatLeft) {
2047     if (VLeft == Left[i - 1])
2048       // Preserve SplatLeft
2049       return false;
2050     if (VRight == Left[i - 1])
2051       return true;
2052   }
2053 
2054   Instruction *ILeft = dyn_cast<Instruction>(VLeft);
2055   Instruction *IRight = dyn_cast<Instruction>(VRight);
2056 
2057   // If we have "AllSameOpcodeRight", try to see if the left operands preserves
2058   // it and not the right, in this case we want to commute.
2059   if (AllSameOpcodeRight) {
2060     unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
2061     if (IRight && RightPrevOpcode == IRight->getOpcode())
2062       // Do not commute, a match on the right preserves AllSameOpcodeRight
2063       return false;
2064     if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
2065       // We have a match and may want to commute, but first check if there is
2066       // not also a match on the existing operands on the Left to preserve
2067       // AllSameOpcodeLeft, i.e. preserve the original order if possible.
2068       // (FIXME: why do we care?)
2069       if (AllSameOpcodeLeft && ILeft &&
2070           cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
2071         return false;
2072       return true;
2073     }
2074   }
2075   // Symmetrically handle Left side.
2076   if (AllSameOpcodeLeft) {
2077     unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
2078     if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
2079       return false;
2080     if (IRight && LeftPrevOpcode == IRight->getOpcode())
2081       return true;
2082   }
2083   return false;
2084 }
2085 
2086 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2087                                              SmallVectorImpl<Value *> &Left,
2088                                              SmallVectorImpl<Value *> &Right) {
2089 
2090   if (VL.size()) {
2091     // Peel the first iteration out of the loop since there's nothing
2092     // interesting to do anyway and it simplifies the checks in the loop.
2093     auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
2094     auto VRight = cast<Instruction>(VL[0])->getOperand(1);
2095     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
2096       // Favor having instruction to the right. FIXME: why?
2097       std::swap(VLeft, VRight);
2098     Left.push_back(VLeft);
2099     Right.push_back(VRight);
2100   }
2101 
2102   // Keep track if we have instructions with all the same opcode on one side.
2103   bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
2104   bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
2105   // Keep track if we have one side with all the same value (broadcast).
2106   bool SplatLeft = true;
2107   bool SplatRight = true;
2108 
2109   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
2110     Instruction *I = cast<Instruction>(VL[i]);
2111     assert(I->isCommutative() && "Can only process commutative instruction");
2112     // Commute to favor either a splat or maximizing having the same opcodes on
2113     // one side.
2114     if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
2115                               AllSameOpcodeRight, SplatLeft, SplatRight)) {
2116       Left.push_back(I->getOperand(1));
2117       Right.push_back(I->getOperand(0));
2118     } else {
2119       Left.push_back(I->getOperand(0));
2120       Right.push_back(I->getOperand(1));
2121     }
2122     // Update Splat* and AllSameOpcode* after the insertion.
2123     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
2124     SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
2125     AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
2126                         (cast<Instruction>(Left[i - 1])->getOpcode() ==
2127                          cast<Instruction>(Left[i])->getOpcode());
2128     AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
2129                          (cast<Instruction>(Right[i - 1])->getOpcode() ==
2130                           cast<Instruction>(Right[i])->getOpcode());
2131   }
2132 
  // If one operand ends up being a broadcast, return this operand order.
2134   if (SplatRight || SplatLeft)
2135     return;
2136 
2137   // Finally check if we can get longer vectorizable chain by reordering
2138   // without breaking the good operand order detected above.
  // E.g., if we have something like -
  // load a[0]  load b[0]
  // load b[1]  load a[1]
  // load a[2]  load b[2]
  // load a[3]  load b[3]
  // Reordering the second pair, "load b[1]  load a[1]", allows us to vectorize
  // this code while still retaining the AllSameOpcode property.
  // FIXME: This load reordering might break AllSameOpcode in some rare cases
  // such as -
2148   // add a[0],c[0]  load b[0]
2149   // add a[1],c[2]  load b[1]
2150   // b[2]           load b[2]
2151   // add a[3],c[3]  load b[3]
2152   for (unsigned j = 0; j < VL.size() - 1; ++j) {
2153     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2154       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2155         if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2156           std::swap(Left[j + 1], Right[j + 1]);
2157           continue;
2158         }
2159       }
2160     }
2161     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2162       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2163         if (isConsecutiveAccess(L, L1, *DL, *SE)) {
2164           std::swap(Left[j + 1], Right[j + 1]);
2165           continue;
2166         }
2167       }
2168     }
2169     // else unchanged
2170   }
2171 }
2172 
2173 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2174   Instruction *VL0 = cast<Instruction>(VL[0]);
2175   BasicBlock::iterator NextInst(VL0);
2176   ++NextInst;
2177   Builder.SetInsertPoint(VL0->getParent(), NextInst);
2178   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2179 }
2180 
2181 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2182   Value *Vec = UndefValue::get(Ty);
2183   // Generate the 'InsertElement' instruction.
2184   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2185     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2186     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2187       GatherSeq.insert(Insrt);
2188       CSEBlocks.insert(Insrt->getParent());
2189 
2190       // Add to our 'need-to-extract' list.
2191       if (ScalarToTreeEntry.count(VL[i])) {
2192         int Idx = ScalarToTreeEntry[VL[i]];
2193         TreeEntry *E = &VectorizableTree[Idx];
2194         // Find which lane we need to extract.
2195         int FoundLane = -1;
2196         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2197           // Is this the lane of the scalar that we are looking for ?
2198           if (E->Scalars[Lane] == VL[i]) {
2199             FoundLane = Lane;
2200             break;
2201           }
2202         }
2203         assert(FoundLane >= 0 && "Could not find the correct lane");
2204         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2205       }
2206     }
2207   }
2208 
2209   return Vec;
2210 }
2211 
2212 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2213   SmallDenseMap<Value*, int>::const_iterator Entry
2214     = ScalarToTreeEntry.find(VL[0]);
2215   if (Entry != ScalarToTreeEntry.end()) {
2216     int Idx = Entry->second;
2217     const TreeEntry *En = &VectorizableTree[Idx];
2218     if (En->isSame(VL) && En->VectorizedValue)
2219       return En->VectorizedValue;
2220   }
2221   return nullptr;
2222 }
2223 
2224 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2225   if (ScalarToTreeEntry.count(VL[0])) {
2226     int Idx = ScalarToTreeEntry[VL[0]];
2227     TreeEntry *E = &VectorizableTree[Idx];
2228     if (E->isSame(VL))
2229       return vectorizeTree(E);
2230   }
2231 
2232   Type *ScalarTy = VL[0]->getType();
2233   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2234     ScalarTy = SI->getValueOperand()->getType();
2235   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2236 
2237   return Gather(VL, VecTy);
2238 }
2239 
2240 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2241   IRBuilder<>::InsertPointGuard Guard(Builder);
2242 
2243   if (E->VectorizedValue) {
2244     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2245     return E->VectorizedValue;
2246   }
2247 
2248   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2249   Type *ScalarTy = VL0->getType();
2250   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2251     ScalarTy = SI->getValueOperand()->getType();
2252   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2253 
2254   if (E->NeedToGather) {
2255     setInsertPointAfterBundle(E->Scalars);
2256     return Gather(E->Scalars, VecTy);
2257   }
2258 
2259   unsigned Opcode = getSameOpcode(E->Scalars);
2260 
2261   switch (Opcode) {
2262     case Instruction::PHI: {
      PHINode *PH = cast<PHINode>(VL0);
2264       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2265       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2266       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2267       E->VectorizedValue = NewPhi;
2268 
2269       // PHINodes may have multiple entries from the same block. We want to
2270       // visit every block once.
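      // For illustration only: a switch with two cases branching to the same
      // successor produces two incoming entries for that block; the second
      // entry simply reuses the incoming vector created for the first one.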
2271       SmallSet<BasicBlock*, 4> VisitedBBs;
2272 
2273       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2274         ValueList Operands;
2275         BasicBlock *IBB = PH->getIncomingBlock(i);
2276 
2277         if (!VisitedBBs.insert(IBB).second) {
2278           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2279           continue;
2280         }
2281 
2282         // Prepare the operand vector.
2283         for (Value *V : E->Scalars)
2284           Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2285 
2286         Builder.SetInsertPoint(IBB->getTerminator());
2287         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2288         Value *Vec = vectorizeTree(Operands);
2289         NewPhi->addIncoming(Vec, IBB);
2290       }
2291 
2292       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2293              "Invalid number of incoming values");
2294       return NewPhi;
2295     }
2296 
2297     case Instruction::ExtractElement: {
2298       if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
2299         Value *V = VL0->getOperand(0);
2300         E->VectorizedValue = V;
2301         return V;
2302       }
2303       return Gather(E->Scalars, VecTy);
2304     }
2305     case Instruction::ExtractValue: {
2306       if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
2307         LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
2308         Builder.SetInsertPoint(LI);
2309         PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
2310         Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
2311         LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
2312         E->VectorizedValue = V;
2313         return propagateMetadata(V, E->Scalars);
2314       }
2315       return Gather(E->Scalars, VecTy);
2316     }
2317     case Instruction::ZExt:
2318     case Instruction::SExt:
2319     case Instruction::FPToUI:
2320     case Instruction::FPToSI:
2321     case Instruction::FPExt:
2322     case Instruction::PtrToInt:
2323     case Instruction::IntToPtr:
2324     case Instruction::SIToFP:
2325     case Instruction::UIToFP:
2326     case Instruction::Trunc:
2327     case Instruction::FPTrunc:
2328     case Instruction::BitCast: {
2329       ValueList INVL;
2330       for (Value *V : E->Scalars)
2331         INVL.push_back(cast<Instruction>(V)->getOperand(0));
2332 
2333       setInsertPointAfterBundle(E->Scalars);
2334 
2335       Value *InVec = vectorizeTree(INVL);
2336 
2337       if (Value *V = alreadyVectorized(E->Scalars))
2338         return V;
2339 
      CastInst *CI = cast<CastInst>(VL0);
2341       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2342       E->VectorizedValue = V;
2343       ++NumVectorInstructions;
2344       return V;
2345     }
2346     case Instruction::FCmp:
2347     case Instruction::ICmp: {
2348       ValueList LHSV, RHSV;
2349       for (Value *V : E->Scalars) {
2350         LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2351         RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2352       }
2353 
2354       setInsertPointAfterBundle(E->Scalars);
2355 
2356       Value *L = vectorizeTree(LHSV);
2357       Value *R = vectorizeTree(RHSV);
2358 
2359       if (Value *V = alreadyVectorized(E->Scalars))
2360         return V;
2361 
2362       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2363       Value *V;
2364       if (Opcode == Instruction::FCmp)
2365         V = Builder.CreateFCmp(P0, L, R);
2366       else
2367         V = Builder.CreateICmp(P0, L, R);
2368 
2369       E->VectorizedValue = V;
2370       ++NumVectorInstructions;
2371       return V;
2372     }
2373     case Instruction::Select: {
2374       ValueList TrueVec, FalseVec, CondVec;
2375       for (Value *V : E->Scalars) {
2376         CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2377         TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2378         FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2379       }
2380 
2381       setInsertPointAfterBundle(E->Scalars);
2382 
2383       Value *Cond = vectorizeTree(CondVec);
2384       Value *True = vectorizeTree(TrueVec);
2385       Value *False = vectorizeTree(FalseVec);
2386 
2387       if (Value *V = alreadyVectorized(E->Scalars))
2388         return V;
2389 
2390       Value *V = Builder.CreateSelect(Cond, True, False);
2391       E->VectorizedValue = V;
2392       ++NumVectorInstructions;
2393       return V;
2394     }
2395     case Instruction::Add:
2396     case Instruction::FAdd:
2397     case Instruction::Sub:
2398     case Instruction::FSub:
2399     case Instruction::Mul:
2400     case Instruction::FMul:
2401     case Instruction::UDiv:
2402     case Instruction::SDiv:
2403     case Instruction::FDiv:
2404     case Instruction::URem:
2405     case Instruction::SRem:
2406     case Instruction::FRem:
2407     case Instruction::Shl:
2408     case Instruction::LShr:
2409     case Instruction::AShr:
2410     case Instruction::And:
2411     case Instruction::Or:
2412     case Instruction::Xor: {
2413       ValueList LHSVL, RHSVL;
2414       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2415         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2416       else
2417         for (Value *V : E->Scalars) {
2418           LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2419           RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2420         }
2421 
2422       setInsertPointAfterBundle(E->Scalars);
2423 
2424       Value *LHS = vectorizeTree(LHSVL);
2425       Value *RHS = vectorizeTree(RHSVL);
2426 
2427       if (LHS == RHS && isa<Instruction>(LHS)) {
2428         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2429       }
2430 
2431       if (Value *V = alreadyVectorized(E->Scalars))
2432         return V;
2433 
2434       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2435       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2436       E->VectorizedValue = V;
2437       propagateIRFlags(E->VectorizedValue, E->Scalars);
2438       ++NumVectorInstructions;
2439 
2440       if (Instruction *I = dyn_cast<Instruction>(V))
2441         return propagateMetadata(I, E->Scalars);
2442 
2443       return V;
2444     }
2445     case Instruction::Load: {
2446       // Loads are inserted at the head of the tree because we don't want to
2447       // sink them all the way down past store instructions.
2448       setInsertPointAfterBundle(E->Scalars);
2449 
2450       LoadInst *LI = cast<LoadInst>(VL0);
2451       Type *ScalarLoadTy = LI->getType();
2452       unsigned AS = LI->getPointerAddressSpace();
2453 
2454       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2455                                             VecTy->getPointerTo(AS));
2456 
2457       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2458       // ExternalUses list to make sure that an extract will be generated in the
2459       // future.
2460       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2461         ExternalUses.push_back(
2462             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2463 
2464       unsigned Alignment = LI->getAlignment();
2465       LI = Builder.CreateLoad(VecPtr);
2466       if (!Alignment) {
2467         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2468       }
2469       LI->setAlignment(Alignment);
2470       E->VectorizedValue = LI;
2471       ++NumVectorInstructions;
2472       return propagateMetadata(LI, E->Scalars);
2473     }
2474     case Instruction::Store: {
2475       StoreInst *SI = cast<StoreInst>(VL0);
2476       unsigned Alignment = SI->getAlignment();
2477       unsigned AS = SI->getPointerAddressSpace();
2478 
2479       ValueList ValueOp;
2480       for (Value *V : E->Scalars)
2481         ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2482 
2483       setInsertPointAfterBundle(E->Scalars);
2484 
2485       Value *VecValue = vectorizeTree(ValueOp);
2486       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2487                                             VecTy->getPointerTo(AS));
2488       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2489 
2490       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2491       // ExternalUses list to make sure that an extract will be generated in the
2492       // future.
2493       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2494         ExternalUses.push_back(
2495             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2496 
2497       if (!Alignment) {
2498         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2499       }
2500       S->setAlignment(Alignment);
2501       E->VectorizedValue = S;
2502       ++NumVectorInstructions;
2503       return propagateMetadata(S, E->Scalars);
2504     }
2505     case Instruction::GetElementPtr: {
2506       setInsertPointAfterBundle(E->Scalars);
2507 
2508       ValueList Op0VL;
2509       for (Value *V : E->Scalars)
2510         Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2511 
2512       Value *Op0 = vectorizeTree(Op0VL);
2513 
2514       std::vector<Value *> OpVecs;
2515       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2516            ++j) {
2517         ValueList OpVL;
2518         for (Value *V : E->Scalars)
2519           OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2520 
2521         Value *OpVec = vectorizeTree(OpVL);
2522         OpVecs.push_back(OpVec);
2523       }
2524 
2525       Value *V = Builder.CreateGEP(
2526           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2527       E->VectorizedValue = V;
2528       ++NumVectorInstructions;
2529 
2530       if (Instruction *I = dyn_cast<Instruction>(V))
2531         return propagateMetadata(I, E->Scalars);
2532 
2533       return V;
2534     }
2535     case Instruction::Call: {
2536       CallInst *CI = cast<CallInst>(VL0);
2537       setInsertPointAfterBundle(E->Scalars);
2538       Function *FI;
2539       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2540       Value *ScalarArg = nullptr;
2541       if (CI && (FI = CI->getCalledFunction())) {
2542         IID = FI->getIntrinsicID();
2543       }
2544       std::vector<Value *> OpVecs;
2545       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2546         ValueList OpVL;
        // ctlz, cttz and powi are special intrinsics whose second argument is
        // a scalar. This argument should not be vectorized.
2549         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2550           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2551           ScalarArg = CEI->getArgOperand(j);
2552           OpVecs.push_back(CEI->getArgOperand(j));
2553           continue;
2554         }
2555         for (Value *V : E->Scalars) {
2556           CallInst *CEI = cast<CallInst>(V);
2557           OpVL.push_back(CEI->getArgOperand(j));
2558         }
2559 
2560         Value *OpVec = vectorizeTree(OpVL);
2561         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2562         OpVecs.push_back(OpVec);
2563       }
2564 
2565       Module *M = F->getParent();
2566       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2567       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2568       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2569       SmallVector<OperandBundleDef, 1> OpBundles;
2570       CI->getOperandBundlesAsDefs(OpBundles);
2571       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
2572 
2573       // The scalar argument uses an in-tree scalar so we add the new vectorized
2574       // call to ExternalUses list to make sure that an extract will be
2575       // generated in the future.
2576       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2577         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2578 
2579       E->VectorizedValue = V;
2580       ++NumVectorInstructions;
2581       return V;
2582     }
2583     case Instruction::ShuffleVector: {
2584       ValueList LHSVL, RHSVL;
2585       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2586       reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2587       setInsertPointAfterBundle(E->Scalars);
2588 
2589       Value *LHS = vectorizeTree(LHSVL);
2590       Value *RHS = vectorizeTree(RHSVL);
2591 
2592       if (Value *V = alreadyVectorized(E->Scalars))
2593         return V;
2594 
2595       // Create a vector of LHS op1 RHS
2596       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2597       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2598 
2599       // Create a vector of LHS op2 RHS
2600       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2601       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2602       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2603 
2604       // Create shuffle to take alternate operations from the vector.
2605       // Also, gather up odd and even scalar ops to propagate IR flags to
2606       // each vector operation.
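      // For illustration only: with e == 4 the mask below becomes
      // <0, 5, 2, 7>, taking even lanes from V0 and odd lanes from V1.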
2607       ValueList OddScalars, EvenScalars;
2608       unsigned e = E->Scalars.size();
2609       SmallVector<Constant *, 8> Mask(e);
2610       for (unsigned i = 0; i < e; ++i) {
2611         if (i & 1) {
2612           Mask[i] = Builder.getInt32(e + i);
2613           OddScalars.push_back(E->Scalars[i]);
2614         } else {
2615           Mask[i] = Builder.getInt32(i);
2616           EvenScalars.push_back(E->Scalars[i]);
2617         }
2618       }
2619 
2620       Value *ShuffleMask = ConstantVector::get(Mask);
2621       propagateIRFlags(V0, EvenScalars);
2622       propagateIRFlags(V1, OddScalars);
2623 
2624       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2625       E->VectorizedValue = V;
2626       ++NumVectorInstructions;
2627       if (Instruction *I = dyn_cast<Instruction>(V))
2628         return propagateMetadata(I, E->Scalars);
2629 
2630       return V;
2631     }
    default:
      llvm_unreachable("unknown inst");
2634   }
2635   return nullptr;
2636 }
2637 
2638 Value *BoUpSLP::vectorizeTree() {
2639 
2640   // All blocks must be scheduled before any instructions are inserted.
2641   for (auto &BSIter : BlocksSchedules) {
2642     scheduleBlock(BSIter.second.get());
2643   }
2644 
2645   Builder.SetInsertPoint(&F->getEntryBlock().front());
2646   auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
2647 
2648   // If the vectorized tree can be rewritten in a smaller type, we truncate the
2649   // vectorized root. InstCombine will then rewrite the entire expression. We
2650   // sign extend the extracted values below.
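  // For illustration only: if MinBWs says the expression fits in i8, a root
  // of type <4 x i32> is truncated here to <4 x i8>, and each external use is
  // sign-extended back to i32 when it is extracted below.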
2651   auto *ScalarRoot = VectorizableTree[0].Scalars[0];
2652   if (MinBWs.count(ScalarRoot)) {
2653     if (auto *I = dyn_cast<Instruction>(VectorRoot))
2654       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
2655     auto BundleWidth = VectorizableTree[0].Scalars.size();
2656     auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
2657     auto *VecTy = VectorType::get(MinTy, BundleWidth);
2658     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
2659     VectorizableTree[0].VectorizedValue = Trunc;
2660   }
2661 
2662   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2663 
2664   // Extract all of the elements with the external uses.
2665   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2666        it != e; ++it) {
2667     Value *Scalar = it->Scalar;
2668     llvm::User *User = it->User;
2669 
2670     // Skip users that we already RAUW. This happens when one instruction
2671     // has multiple uses of the same value.
2672     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2673         Scalar->user_end())
2674       continue;
2675     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2676 
2677     int Idx = ScalarToTreeEntry[Scalar];
2678     TreeEntry *E = &VectorizableTree[Idx];
2679     assert(!E->NeedToGather && "Extracting from a gather list");
2680 
2681     Value *Vec = E->VectorizedValue;
2682     assert(Vec && "Can't find vectorizable value");
2683 
2684     Value *Lane = Builder.getInt32(it->Lane);
2685     // Generate extracts for out-of-tree users.
2686     // Find the insertion point for the extractelement lane.
2687     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
2688       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2689         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2690           if (PH->getIncomingValue(i) == Scalar) {
2691             TerminatorInst *IncomingTerminator =
2692                 PH->getIncomingBlock(i)->getTerminator();
2693             if (isa<CatchSwitchInst>(IncomingTerminator)) {
2694               Builder.SetInsertPoint(VecI->getParent(),
2695                                      std::next(VecI->getIterator()));
2696             } else {
2697               Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2698             }
2699             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2700             if (MinBWs.count(ScalarRoot))
2701               Ex = Builder.CreateSExt(Ex, Scalar->getType());
2702             CSEBlocks.insert(PH->getIncomingBlock(i));
2703             PH->setOperand(i, Ex);
2704           }
2705         }
2706       } else {
2707         Builder.SetInsertPoint(cast<Instruction>(User));
2708         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2709         if (MinBWs.count(ScalarRoot))
2710           Ex = Builder.CreateSExt(Ex, Scalar->getType());
2711         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2712         User->replaceUsesOfWith(Scalar, Ex);
2713      }
2714     } else {
2715       Builder.SetInsertPoint(&F->getEntryBlock().front());
2716       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2717       if (MinBWs.count(ScalarRoot))
2718         Ex = Builder.CreateSExt(Ex, Scalar->getType());
2719       CSEBlocks.insert(&F->getEntryBlock());
2720       User->replaceUsesOfWith(Scalar, Ex);
2721     }
2722 
2723     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2724   }
2725 
2726   // For each vectorized value:
2727   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2728     TreeEntry *Entry = &VectorizableTree[EIdx];
2729 
2730     // For each lane:
2731     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2732       Value *Scalar = Entry->Scalars[Lane];
2733       // No need to handle users of gathered values.
2734       if (Entry->NeedToGather)
2735         continue;
2736 
2737       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2738 
2739       Type *Ty = Scalar->getType();
2740       if (!Ty->isVoidTy()) {
2741 #ifndef NDEBUG
2742         for (User *U : Scalar->users()) {
2743           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2744 
2745           assert((ScalarToTreeEntry.count(U) ||
2746                   // It is legal to replace users in the ignorelist by undef.
2747                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2748                    UserIgnoreList.end())) &&
2749                  "Replacing out-of-tree value with undef");
2750         }
2751 #endif
2752         Value *Undef = UndefValue::get(Ty);
2753         Scalar->replaceAllUsesWith(Undef);
2754       }
2755       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2756       eraseInstruction(cast<Instruction>(Scalar));
2757     }
2758   }
2759 
2760   Builder.ClearInsertionPoint();
2761 
2762   return VectorizableTree[0].VectorizedValue;
2763 }
2764 
2765 void BoUpSLP::optimizeGatherSequence() {
2766   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2767         << " gather sequences instructions.\n");
2768   // LICM InsertElementInst sequences.
2769   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2770        e = GatherSeq.end(); it != e; ++it) {
2771     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2772 
2773     if (!Insert)
2774       continue;
2775 
2776     // Check if this block is inside a loop.
2777     Loop *L = LI->getLoopFor(Insert->getParent());
2778     if (!L)
2779       continue;
2780 
2781     // Check if it has a preheader.
2782     BasicBlock *PreHeader = L->getLoopPreheader();
2783     if (!PreHeader)
2784       continue;
2785 
    // If the vector or the element that we insert into it is an instruction
    // that is defined inside the loop, then we can't hoist the insert.
2789     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2790     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2791     if (CurrVec && L->contains(CurrVec))
2792       continue;
2793     if (NewElem && L->contains(NewElem))
2794       continue;
2795 
2796     // We can hoist this instruction. Move it to the pre-header.
2797     Insert->moveBefore(PreHeader->getTerminator());
2798   }
2799 
2800   // Make a list of all reachable blocks in our CSE queue.
2801   SmallVector<const DomTreeNode *, 8> CSEWorkList;
2802   CSEWorkList.reserve(CSEBlocks.size());
2803   for (BasicBlock *BB : CSEBlocks)
2804     if (DomTreeNode *N = DT->getNode(BB)) {
2805       assert(DT->isReachableFromEntry(N));
2806       CSEWorkList.push_back(N);
2807     }
2808 
2809   // Sort blocks by domination. This ensures we visit a block after all blocks
2810   // dominating it are visited.
2811   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2812                    [this](const DomTreeNode *A, const DomTreeNode *B) {
2813     return DT->properlyDominates(A, B);
2814   });
2815 
2816   // Perform O(N^2) search over the gather sequences and merge identical
2817   // instructions. TODO: We can further optimize this scan if we split the
2818   // instructions into different buckets based on the insert lane.
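       // Illustrative example (hypothetical IR): if block A dominates block B
       // and both contain an identical instruction such as
       //   %e = extractelement <4 x i32> %vec, i32 0
       // the copy in B is replaced by the one in A and erased, as the dominance
       // check below allows.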
2819   SmallVector<Instruction *, 16> Visited;
2820   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2821     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2822            "Worklist not sorted properly!");
2823     BasicBlock *BB = (*I)->getBlock();
2824     // For all instructions in blocks containing gather sequences:
2825     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2826       Instruction *In = &*it++;
2827       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2828         continue;
2829 
2830       // Check if we can replace this instruction with any of the
2831       // visited instructions.
2832       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2833                                                     ve = Visited.end();
2834            v != ve; ++v) {
2835         if (In->isIdenticalTo(*v) &&
2836             DT->dominates((*v)->getParent(), In->getParent())) {
2837           In->replaceAllUsesWith(*v);
2838           eraseInstruction(In);
2839           In = nullptr;
2840           break;
2841         }
2842       }
2843       if (In) {
2844         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2845         Visited.push_back(In);
2846       }
2847     }
2848   }
2849   CSEBlocks.clear();
2850   GatherSeq.clear();
2851 }
2852 
2853 // Groups the instructions into a bundle (which is then a single scheduling
2854 // entity) and schedules instructions until the bundle becomes ready.
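     // Illustrative example (hypothetical bundle): for VL = { %l0 = load, %l1 = load }
     // both loads become members of a single bundle, which the scheduler then treats
     // as one scheduling entity carrying the combined dependencies of its members.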
2855 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2856                                                  BoUpSLP *SLP) {
2857   if (isa<PHINode>(VL[0]))
2858     return true;
2859 
2860   // Initialize the instruction bundle.
2861   Instruction *OldScheduleEnd = ScheduleEnd;
2862   ScheduleData *PrevInBundle = nullptr;
2863   ScheduleData *Bundle = nullptr;
2864   bool ReSchedule = false;
2865   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
2866 
2867   // Make sure that the scheduling region contains all
2868   // instructions of the bundle.
2869   for (Value *V : VL) {
2870     if (!extendSchedulingRegion(V))
2871       return false;
2872   }
2873 
2874   for (Value *V : VL) {
2875     ScheduleData *BundleMember = getScheduleData(V);
2876     assert(BundleMember &&
2877            "no ScheduleData for bundle member (maybe not in same basic block)");
2878     if (BundleMember->IsScheduled) {
2879       // A bundle member was scheduled as a single instruction before and
2880       // now needs to be scheduled as part of the bundle. We just get rid of
2881       // the existing schedule.
2882       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
2883                    << " was already scheduled\n");
2884       ReSchedule = true;
2885     }
2886     assert(BundleMember->isSchedulingEntity() &&
2887            "bundle member already part of other bundle");
2888     if (PrevInBundle) {
2889       PrevInBundle->NextInBundle = BundleMember;
2890     } else {
2891       Bundle = BundleMember;
2892     }
2893     BundleMember->UnscheduledDepsInBundle = 0;
2894     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2895 
2896     // Group the instructions into a bundle.
2897     BundleMember->FirstInBundle = Bundle;
2898     PrevInBundle = BundleMember;
2899   }
2900   if (ScheduleEnd != OldScheduleEnd) {
2901     // The scheduling region got new instructions at the lower end (or it is a
2902     // new region for the first bundle). This makes it necessary to
2903     // recalculate all dependencies.
2904     // It is seldom that this needs to be done a second time after adding the
2905     // initial bundle to the region.
2906     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2907       ScheduleData *SD = getScheduleData(I);
2908       SD->clearDependencies();
2909     }
2910     ReSchedule = true;
2911   }
2912   if (ReSchedule) {
2913     resetSchedule();
2914     initialFillReadyList(ReadyInsts);
2915   }
2916 
2917   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2918                << BB->getName() << "\n");
2919 
2920   calculateDependencies(Bundle, true, SLP);
2921 
2922   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2923   // means that there are no cyclic dependencies and we can schedule it.
2924   // Note that it's important that we don't "schedule" the bundle yet (see
2925   // cancelScheduling).
2926   while (!Bundle->isReady() && !ReadyInsts.empty()) {
2927 
2928     ScheduleData *pickedSD = ReadyInsts.back();
2929     ReadyInsts.pop_back();
2930 
2931     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2932       schedule(pickedSD, ReadyInsts);
2933     }
2934   }
2935   if (!Bundle->isReady()) {
2936     cancelScheduling(VL);
2937     return false;
2938   }
2939   return true;
2940 }
2941 
2942 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2943   if (isa<PHINode>(VL[0]))
2944     return;
2945 
2946   ScheduleData *Bundle = getScheduleData(VL[0]);
2947   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
2948   assert(!Bundle->IsScheduled &&
2949          "Can't cancel bundle which is already scheduled");
2950   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2951          "tried to unbundle something which is not a bundle");
2952 
2953   // Un-bundle: make single instructions out of the bundle.
2954   ScheduleData *BundleMember = Bundle;
2955   while (BundleMember) {
2956     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2957     BundleMember->FirstInBundle = BundleMember;
2958     ScheduleData *Next = BundleMember->NextInBundle;
2959     BundleMember->NextInBundle = nullptr;
2960     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2961     if (BundleMember->UnscheduledDepsInBundle == 0) {
2962       ReadyInsts.insert(BundleMember);
2963     }
2964     BundleMember = Next;
2965   }
2966 }
2967 
2968 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2969   if (getScheduleData(V))
2970     return true;
2971   Instruction *I = dyn_cast<Instruction>(V);
2972   assert(I && "bundle member must be an instruction");
2973   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2974   if (!ScheduleStart) {
2975     // It's the first instruction in the new region.
2976     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2977     ScheduleStart = I;
2978     ScheduleEnd = I->getNextNode();
2979     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2980     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
2981     return true;
2982   }
2983   // Search up and down at the same time, because we don't know if the new
2984   // instruction is above or below the existing scheduling region.
2985   BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
2986   BasicBlock::reverse_iterator UpperEnd = BB->rend();
2987   BasicBlock::iterator DownIter(ScheduleEnd);
2988   BasicBlock::iterator LowerEnd = BB->end();
2989   for (;;) {
2990     if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
2991       DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
2992       return false;
2993     }
2994 
2995     if (UpIter != UpperEnd) {
2996       if (&*UpIter == I) {
2997         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2998         ScheduleStart = I;
2999         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
3000         return true;
3001       }
3002       UpIter++;
3003     }
3004     if (DownIter != LowerEnd) {
3005       if (&*DownIter == I) {
3006         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
3007                          nullptr);
3008         ScheduleEnd = I->getNextNode();
3009         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
3010         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
3011         return true;
3012       }
3013       DownIter++;
3014     }
3015     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
3016            "instruction not found in block");
3017   }
3018   return true;
3019 }
3020 
3021 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
3022                                                 Instruction *ToI,
3023                                                 ScheduleData *PrevLoadStore,
3024                                                 ScheduleData *NextLoadStore) {
3025   ScheduleData *CurrentLoadStore = PrevLoadStore;
3026   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
3027     ScheduleData *SD = ScheduleDataMap[I];
3028     if (!SD) {
3029       // Allocate a new ScheduleData for the instruction.
3030       if (ChunkPos >= ChunkSize) {
3031         ScheduleDataChunks.push_back(
3032             llvm::make_unique<ScheduleData[]>(ChunkSize));
3033         ChunkPos = 0;
3034       }
3035       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
3036       ScheduleDataMap[I] = SD;
3037       SD->Inst = I;
3038     }
3039     assert(!isInSchedulingRegion(SD) &&
3040            "new ScheduleData already in scheduling region");
3041     SD->init(SchedulingRegionID);
3042 
3043     if (I->mayReadOrWriteMemory()) {
3044       // Update the linked list of memory accessing instructions.
3045       if (CurrentLoadStore) {
3046         CurrentLoadStore->NextLoadStore = SD;
3047       } else {
3048         FirstLoadStoreInRegion = SD;
3049       }
3050       CurrentLoadStore = SD;
3051     }
3052   }
3053   if (NextLoadStore) {
3054     if (CurrentLoadStore)
3055       CurrentLoadStore->NextLoadStore = NextLoadStore;
3056   } else {
3057     LastLoadStoreInRegion = CurrentLoadStore;
3058   }
3059 }
3060 
3061 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
3062                                                      bool InsertInReadyList,
3063                                                      BoUpSLP *SLP) {
3064   assert(SD->isSchedulingEntity());
3065 
3066   SmallVector<ScheduleData *, 10> WorkList;
3067   WorkList.push_back(SD);
3068 
3069   while (!WorkList.empty()) {
3070     ScheduleData *SD = WorkList.back();
3071     WorkList.pop_back();
3072 
3073     ScheduleData *BundleMember = SD;
3074     while (BundleMember) {
3075       assert(isInSchedulingRegion(BundleMember));
3076       if (!BundleMember->hasValidDependencies()) {
3077 
3078         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
3079         BundleMember->Dependencies = 0;
3080         BundleMember->resetUnscheduledDeps();
3081 
3082         // Handle def-use chain dependencies.
3083         for (User *U : BundleMember->Inst->users()) {
3084           if (isa<Instruction>(U)) {
3085             ScheduleData *UseSD = getScheduleData(U);
3086             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
3087               BundleMember->Dependencies++;
3088               ScheduleData *DestBundle = UseSD->FirstInBundle;
3089               if (!DestBundle->IsScheduled) {
3090                 BundleMember->incrementUnscheduledDeps(1);
3091               }
3092               if (!DestBundle->hasValidDependencies()) {
3093                 WorkList.push_back(DestBundle);
3094               }
3095             }
3096           } else {
3097             // We are not sure whether this can ever happen. But we need to
3098             // be safe. This keeps the instruction/bundle from ever becoming
3099             // ready, which eventually disables vectorization of it.
3100             BundleMember->Dependencies++;
3101             BundleMember->incrementUnscheduledDeps(1);
3102           }
3103         }
3104 
3105         // Handle the memory dependencies.
3106         ScheduleData *DepDest = BundleMember->NextLoadStore;
3107         if (DepDest) {
3108           Instruction *SrcInst = BundleMember->Inst;
3109           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
3110           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
3111           unsigned numAliased = 0;
3112           unsigned DistToSrc = 1;
3113 
3114           while (DepDest) {
3115             assert(isInSchedulingRegion(DepDest));
3116 
3117             // We have two limits to reduce the complexity:
3118             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
3119             //    SLP->isAliased (which is the expensive part in this loop).
3120             // 2) MaxMemDepDistance: It's for very large blocks and it aborts
3121             //    the whole loop (even if the loop is fast, it's quadratic).
3122             //    It's important for the loop break condition (see below) to
3123             //    check this limit even between two read-only instructions.
3124             if (DistToSrc >= MaxMemDepDistance ||
3125                     ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
3126                      (numAliased >= AliasedCheckLimit ||
3127                       SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
3128 
3129               // We increment the counter only if the locations are aliased
3130               // (instead of counting all alias checks). This gives a better
3131               // balance between reduced runtime and accurate dependencies.
3132               numAliased++;
3133 
3134               DepDest->MemoryDependencies.push_back(BundleMember);
3135               BundleMember->Dependencies++;
3136               ScheduleData *DestBundle = DepDest->FirstInBundle;
3137               if (!DestBundle->IsScheduled) {
3138                 BundleMember->incrementUnscheduledDeps(1);
3139               }
3140               if (!DestBundle->hasValidDependencies()) {
3141                 WorkList.push_back(DestBundle);
3142               }
3143             }
3144             DepDest = DepDest->NextLoadStore;
3145 
3146             // Example, explaining the loop break condition: Let's assume our
3147             // starting instruction is i0 and MaxMemDepDistance = 3.
3148             //
3149             //                      +--------v--v--v
3150             //             i0,i1,i2,i3,i4,i5,i6,i7,i8
3151             //             +--------^--^--^
3152             //
3153             // MaxMemDepDistance let us stop alias-checking at i3 and we add
3154             // dependencies from i0 to i3,i4,.. (even if they are not aliased).
3155             // Previously we already added dependencies from i3 to i6,i7,i8
3156             // (because of MaxMemDepDistance). As we added a dependency from
3157             // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
3158             // and we can abort this loop at i6.
3159             if (DistToSrc >= 2 * MaxMemDepDistance)
3160                 break;
3161             DistToSrc++;
3162           }
3163         }
3164       }
3165       BundleMember = BundleMember->NextInBundle;
3166     }
3167     if (InsertInReadyList && SD->isReady()) {
3168       ReadyInsts.push_back(SD);
3169       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
3170     }
3171   }
3172 }
3173 
3174 void BoUpSLP::BlockScheduling::resetSchedule() {
3175   assert(ScheduleStart &&
3176          "tried to reset schedule on block which has not been scheduled");
3177   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3178     ScheduleData *SD = getScheduleData(I);
3179     assert(isInSchedulingRegion(SD));
3180     SD->IsScheduled = false;
3181     SD->resetUnscheduledDeps();
3182   }
3183   ReadyInsts.clear();
3184 }
3185 
3186 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
3187 
3188   if (!BS->ScheduleStart)
3189     return;
3190 
3191   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
3192 
3193   BS->resetSchedule();
3194 
3195   // For the real scheduling we use a more sophisticated ready-list: it is
3196   // sorted by the original instruction location. This lets the final schedule
3197   // be as close as possible to the original instruction order.
3198   struct ScheduleDataCompare {
3199     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3200       return SD2->SchedulingPriority < SD1->SchedulingPriority;
3201     }
3202   };
3203   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3204 
3205   // Ensure that all dependency data is updated and fill the ready-list with
3206   // initial instructions.
3207   int Idx = 0;
3208   int NumToSchedule = 0;
3209   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3210        I = I->getNextNode()) {
3211     ScheduleData *SD = BS->getScheduleData(I);
3212     assert(
3213         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3214         "scheduler and vectorizer have different opinion on what is a bundle");
3215     SD->FirstInBundle->SchedulingPriority = Idx++;
3216     if (SD->isSchedulingEntity()) {
3217       BS->calculateDependencies(SD, false, this);
3218       NumToSchedule++;
3219     }
3220   }
3221   BS->initialFillReadyList(ReadyInsts);
3222 
3223   Instruction *LastScheduledInst = BS->ScheduleEnd;
3224 
3225   // Do the "real" scheduling.
3226   while (!ReadyInsts.empty()) {
3227     ScheduleData *picked = *ReadyInsts.begin();
3228     ReadyInsts.erase(ReadyInsts.begin());
3229 
3230     // Move the scheduled instruction(s) to their dedicated places, if not
3231     // there yet.
3232     ScheduleData *BundleMember = picked;
3233     while (BundleMember) {
3234       Instruction *pickedInst = BundleMember->Inst;
3235       if (LastScheduledInst->getNextNode() != pickedInst) {
3236         BS->BB->getInstList().remove(pickedInst);
3237         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3238                                      pickedInst);
3239       }
3240       LastScheduledInst = pickedInst;
3241       BundleMember = BundleMember->NextInBundle;
3242     }
3243 
3244     BS->schedule(picked, ReadyInsts);
3245     NumToSchedule--;
3246   }
3247   assert(NumToSchedule == 0 && "could not schedule all instructions");
3248 
3249   // Avoid duplicate scheduling of the block.
3250   BS->ScheduleStart = nullptr;
3251 }
3252 
3253 unsigned BoUpSLP::getVectorElementSize(Value *V) {
3254   // If V is a store, just return the width of the stored value without
3255   // traversing the expression tree. This is the common case.
3256   if (auto *Store = dyn_cast<StoreInst>(V))
3257     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
3258 
3259   // If V is not a store, we can traverse the expression tree to find loads
3260   // that feed it. The type of the loaded value may indicate a more suitable
3261   // width than V's type. We want to base the vector element size on the width
3262   // of memory operations where possible.
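       // Illustrative example (hypothetical IR): if V is
       //   %e = sext i16 %l to i64
       // and %l is
       //   %l = load i16, i16* %p
       // the traversal below finds the i16 load and returns 16 rather than the
       // 64 bits of V's own type.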
3263   SmallVector<Instruction *, 16> Worklist;
3264   SmallPtrSet<Instruction *, 16> Visited;
3265   if (auto *I = dyn_cast<Instruction>(V))
3266     Worklist.push_back(I);
3267 
3268   // Traverse the expression tree in bottom-up order looking for loads. If we
3269   // encounter an instruction we don't yet handle, we give up.
3270   auto MaxWidth = 0u;
3271   auto FoundUnknownInst = false;
3272   while (!Worklist.empty() && !FoundUnknownInst) {
3273     auto *I = Worklist.pop_back_val();
3274     Visited.insert(I);
3275 
3276     // We should only be looking at scalar instructions here. If the current
3277     // instruction has a vector type, give up.
3278     auto *Ty = I->getType();
3279     if (isa<VectorType>(Ty))
3280       FoundUnknownInst = true;
3281 
3282     // If the current instruction is a load, update MaxWidth to reflect the
3283     // width of the loaded value.
3284     else if (isa<LoadInst>(I))
3285       MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
3286 
3287     // Otherwise, we need to visit the operands of the instruction. We only
3288     // handle the interesting cases from buildTree here. If an operand is an
3289     // instruction we haven't yet visited, we add it to the worklist.
3290     else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3291              isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3292       for (Use &U : I->operands())
3293         if (auto *J = dyn_cast<Instruction>(U.get()))
3294           if (!Visited.count(J))
3295             Worklist.push_back(J);
3296     }
3297 
3298     // If we don't yet handle the instruction, give up.
3299     else
3300       FoundUnknownInst = true;
3301   }
3302 
3303   // If we didn't encounter a memory access in the expression tree, or if we
3304   // gave up for some reason, just return the width of V.
3305   if (!MaxWidth || FoundUnknownInst)
3306     return DL->getTypeSizeInBits(V->getType());
3307 
3308   // Otherwise, return the maximum width we found.
3309   return MaxWidth;
3310 }
3311 
3312 // Determine if a value V in a vectorizable expression Expr can be demoted to a
3313 // smaller type with a truncation. We collect the values that will be demoted
3314 // in ToDemote and additional roots that require investigating in Roots.
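     // Illustrative example (hypothetical IR): given
     //   %a = zext i8 %x to i32
     //   %b = zext i8 %y to i32
     //   %s = add i32 %a, %b
     // and assuming %a, %b and %s are all in Expr and have single uses, a call on
     // %s recursively demotes %a and %b and then records %s itself in ToDemote.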
3315 static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
3316                                   SmallVectorImpl<Value *> &ToDemote,
3317                                   SmallVectorImpl<Value *> &Roots) {
3318 
3319   // We can always demote constants.
3320   if (isa<Constant>(V)) {
3321     ToDemote.push_back(V);
3322     return true;
3323   }
3324 
3325   // If the value is not an instruction in the expression with only one use, it
3326   // cannot be demoted.
3327   auto *I = dyn_cast<Instruction>(V);
3328   if (!I || !I->hasOneUse() || !Expr.count(I))
3329     return false;
3330 
3331   switch (I->getOpcode()) {
3332 
3333   // We can always demote truncations and extensions. Since truncations can
3334   // seed additional demotion, we save the truncated value.
3335   case Instruction::Trunc:
3336     Roots.push_back(I->getOperand(0));
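         // Intentional fallthrough: the truncation itself is recorded in
         // ToDemote below.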
3337   case Instruction::ZExt:
3338   case Instruction::SExt:
3339     break;
3340 
3341   // We can demote certain binary operations if we can demote both of their
3342   // operands.
3343   case Instruction::Add:
3344   case Instruction::Sub:
3345   case Instruction::Mul:
3346   case Instruction::And:
3347   case Instruction::Or:
3348   case Instruction::Xor:
3349     if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
3350         !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
3351       return false;
3352     break;
3353 
3354   // We can demote selects if we can demote their true and false values.
3355   case Instruction::Select: {
3356     SelectInst *SI = cast<SelectInst>(I);
3357     if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
3358         !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
3359       return false;
3360     break;
3361   }
3362 
3363   // We can demote phis if we can demote all their incoming operands. Note that
3364   // we don't need to worry about cycles since we ensure single use above.
3365   case Instruction::PHI: {
3366     PHINode *PN = cast<PHINode>(I);
3367     for (Value *IncValue : PN->incoming_values())
3368       if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
3369         return false;
3370     break;
3371   }
3372 
3373   // Otherwise, conservatively give up.
3374   default:
3375     return false;
3376   }
3377 
3378   // Record the value that we can demote.
3379   ToDemote.push_back(V);
3380   return true;
3381 }
3382 
3383 void BoUpSLP::computeMinimumValueSizes() {
3384   // If there are no external uses, the expression tree must be rooted by a
3385   // store. We can't demote in-memory values, so there is nothing to do here.
3386   if (ExternalUses.empty())
3387     return;
3388 
3389   // We only attempt to truncate integer expressions.
3390   auto &TreeRoot = VectorizableTree[0].Scalars;
3391   auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
3392   if (!TreeRootIT)
3393     return;
3394 
3395   // If the expression is not rooted by a store, these roots should have
3396   // external uses. We will rely on InstCombine to rewrite the expression in
3397   // the narrower type. However, InstCombine only rewrites single-use values.
3398   // This means that if a tree entry other than a root is used externally, it
3399   // must have multiple uses and InstCombine will not rewrite it. The code
3400   // below ensures that only the roots are used externally.
3401   SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
3402   for (auto &EU : ExternalUses)
3403     if (!Expr.erase(EU.Scalar))
3404       return;
3405   if (!Expr.empty())
3406     return;
3407 
3408   // Collect the scalar values of the vectorizable expression. We will use this
3409   // context to determine which values can be demoted. If we see a truncation,
3410   // we mark it as seeding another demotion.
3411   for (auto &Entry : VectorizableTree)
3412     Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
3413 
3414   // Ensure the roots of the vectorizable tree don't form a cycle. They must
3415   // have a single external user that is not in the vectorizable tree.
3416   for (auto *Root : TreeRoot)
3417     if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
3418       return;
3419 
3420   // Conservatively determine if we can actually truncate the roots of the
3421   // expression. Collect the values that can be demoted in ToDemote and
3422   // additional roots that require investigating in Roots.
3423   SmallVector<Value *, 32> ToDemote;
3424   SmallVector<Value *, 4> Roots;
3425   for (auto *Root : TreeRoot)
3426     if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
3427       return;
3428 
3429   // The maximum bit width required to represent all the values that can be
3430   // demoted without loss of precision. It would be safe to truncate the roots
3431   // of the expression to this width.
3432   auto MaxBitWidth = 8u;
3433 
3434   // We first check if all the bits of the roots are demanded. If they're not,
3435   // we can truncate the roots to this narrower type.
3436   for (auto *Root : TreeRoot) {
3437     auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
3438     MaxBitWidth = std::max<unsigned>(
3439         Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
3440   }
3441 
3442   // If all the bits of the roots are demanded, we can try a little harder to
3443   // compute a narrower type. This can happen, for example, if the roots are
3444   // getelementptr indices. InstCombine promotes these indices to the pointer
3445   // width. Thus, all their bits are technically demanded even though the
3446   // address computation might be vectorized in a smaller type.
3447   //
3448   // We start by looking at each entry that can be demoted. We compute the
3449   // maximum bit width required to store the scalar by using ValueTracking to
3450   // compute the number of high-order bits we can truncate.
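       // Illustrative example: for a demotion candidate defined as
       //   %i = sext i8 %c to i64
       // ComputeNumSignBits reports at least 57 sign bits, so 64 - 57 = 7 bits
       // suffice and MaxBitWidth stays at the minimum of 8 below.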
3451   if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
3452     MaxBitWidth = 8u;
3453     for (auto *Scalar : ToDemote) {
3454       auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
3455       auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
3456       MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
3457     }
3458   }
3459 
3460   // Round MaxBitWidth up to the next power-of-two.
3461   if (!isPowerOf2_64(MaxBitWidth))
3462     MaxBitWidth = NextPowerOf2(MaxBitWidth);
3463 
3464   // If the maximum bit width we compute is less than the width of the roots'
3465   // type, we can proceed with the narrowing. Otherwise, do nothing.
3466   if (MaxBitWidth >= TreeRootIT->getBitWidth())
3467     return;
3468 
3469   // If we can truncate the root, we must collect additional values that might
3470   // be demoted as a result. That is, those seeded by truncations we will
3471   // modify.
3472   while (!Roots.empty())
3473     collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
3474 
3475   // Finally, map the values we can demote to the maximum bit width we computed.
3476   for (auto *Scalar : ToDemote)
3477     MinBWs[Scalar] = MaxBitWidth;
3478 }
3479 
3480 /// The SLPVectorizer Pass.
3481 struct SLPVectorizer : public FunctionPass {
3482   typedef SmallVector<StoreInst *, 8> StoreList;
3483   typedef MapVector<Value *, StoreList> StoreListMap;
3484   typedef SmallVector<WeakVH, 8> WeakVHList;
3485   typedef MapVector<Value *, WeakVHList> WeakVHListMap;
3486 
3487   /// Pass identification, replacement for typeid
3488   static char ID;
3489 
3490   explicit SLPVectorizer() : FunctionPass(ID) {
3491     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3492   }
3493 
3494   ScalarEvolution *SE;
3495   TargetTransformInfo *TTI;
3496   TargetLibraryInfo *TLI;
3497   AliasAnalysis *AA;
3498   LoopInfo *LI;
3499   DominatorTree *DT;
3500   AssumptionCache *AC;
3501   DemandedBits *DB;
3502   const DataLayout *DL;
3503 
3504   bool doInitialization(Module &M) override {
3505     DL = &M.getDataLayout();
3506     return false;
3507   }
3508 
3509   bool runOnFunction(Function &F) override {
3510     if (skipFunction(F))
3511       return false;
3512 
3513     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3514     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3515     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3516     TLI = TLIP ? &TLIP->getTLI() : nullptr;
3517     AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3518     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3519     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3520     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3521     DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
3522 
3523     Stores.clear();
3524     GEPs.clear();
3525     bool Changed = false;
3526 
3527     // If the target claims to have no vector registers don't attempt
3528     // vectorization.
3529     if (!TTI->getNumberOfRegisters(true))
3530       return false;
3531 
3532     // Don't vectorize when the attribute NoImplicitFloat is used.
3533     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3534       return false;
3535 
3536     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3537 
3538     // Use the bottom up slp vectorizer to construct chains that start with
3539     // store instructions.
3540     BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
3541 
3542     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3543     // delete instructions.
3544 
3545     // Scan the blocks in the function in post order.
3546     for (auto BB : post_order(&F.getEntryBlock())) {
3547       collectSeedInstructions(BB);
3548 
3549       // Vectorize trees that end at stores.
3550       if (!Stores.empty()) {
3551         DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
3552                      << " underlying objects.\n");
3553         Changed |= vectorizeStoreChains(R);
3554       }
3555 
3556       // Vectorize trees that end at reductions.
3557       Changed |= vectorizeChainsInBlock(BB, R);
3558 
3559       // Vectorize the index computations of getelementptr instructions. This
3560       // is primarily intended to catch gather-like idioms ending at
3561       // non-consecutive loads.
3562       if (!GEPs.empty()) {
3563         DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
3564                      << " underlying objects.\n");
3565         Changed |= vectorizeGEPIndices(BB, R);
3566       }
3567     }
3568 
3569     if (Changed) {
3570       R.optimizeGatherSequence();
3571       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3572       DEBUG(verifyFunction(F));
3573     }
3574     return Changed;
3575   }
3576 
3577   void getAnalysisUsage(AnalysisUsage &AU) const override {
3578     FunctionPass::getAnalysisUsage(AU);
3579     AU.addRequired<AssumptionCacheTracker>();
3580     AU.addRequired<ScalarEvolutionWrapperPass>();
3581     AU.addRequired<AAResultsWrapperPass>();
3582     AU.addRequired<TargetTransformInfoWrapperPass>();
3583     AU.addRequired<LoopInfoWrapperPass>();
3584     AU.addRequired<DominatorTreeWrapperPass>();
3585     AU.addRequired<DemandedBitsWrapperPass>();
3586     AU.addPreserved<LoopInfoWrapperPass>();
3587     AU.addPreserved<DominatorTreeWrapperPass>();
3588     AU.addPreserved<AAResultsWrapperPass>();
3589     AU.addPreserved<GlobalsAAWrapperPass>();
3590     AU.setPreservesCFG();
3591   }
3592 
3593 private:
3594   /// \brief Collect store and getelementptr instructions and organize them
3595   /// according to the underlying object of their pointer operands. We sort the
3596   /// instructions by their underlying objects to reduce the cost of
3597   /// consecutive access queries.
3598   ///
3599   /// TODO: We can further reduce this cost if we flush the chain creation
3600   ///       every time we run into a memory barrier.
3601   void collectSeedInstructions(BasicBlock *BB);
3602 
3603   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3604   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3605 
3606   /// \brief Try to vectorize a list of operands.
3607   /// \param BuildVector A list of users to ignore for the purpose of
3608   ///                     scheduling and that don't need extracting.
3609   /// \returns true if a value was vectorized.
3610   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3611                           ArrayRef<Value *> BuildVector = None,
3612                           bool allowReorder = false);
3613 
3614   /// \brief Try to vectorize a chain that may start at the operands of \p V.
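       /// For illustration: for V = A + B the pair (A, B) is tried first; if B
       /// is a single-use binary operator B0 + B1, the pairs (A, B0) and (A, B1)
       /// are tried next, and symmetrically for a single-use A.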
3615   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3616 
3617   /// \brief Vectorize the store instructions collected in Stores.
3618   bool vectorizeStoreChains(BoUpSLP &R);
3619 
3620   /// \brief Vectorize the index computations of the getelementptr instructions
3621   /// collected in GEPs.
3622   bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
3623 
3624   /// \brief Scan the basic block and look for patterns that are likely to start
3625   /// a vectorization chain.
3626   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3627 
3628   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3629                            BoUpSLP &R, unsigned VecRegSize);
3630 
3631   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3632                        BoUpSLP &R);
3633 
3634   /// The store instructions in a basic block organized by base pointer.
3635   StoreListMap Stores;
3636 
3637   /// The getelementptr instructions in a basic block organized by base pointer.
3638   WeakVHListMap GEPs;
3639 };
3640 
3641 /// \brief Check that the values in the given slice of the VL array still
3642 /// match the corresponding entries in the WeakVH array.
3643 /// Vectorization of part of the VL array may cause later values in the VL array
3644 /// to become invalid. We track when this has happened in the WeakVH array.
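     /// For illustration: if a slice was vectorized in an earlier iteration, the
     /// corresponding WeakVH entries follow the RAUW (or become null on deletion),
     /// so std::equal fails and the caller skips that slice.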
3645 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3646                                unsigned SliceBegin, unsigned SliceSize) {
3647   VL = VL.slice(SliceBegin, SliceSize);
3648   VH = VH.slice(SliceBegin, SliceSize);
3649   return !std::equal(VL.begin(), VL.end(), VH.begin());
3650 }
3651 
3652 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3653                                         int CostThreshold, BoUpSLP &R,
3654                                         unsigned VecRegSize) {
3655   unsigned ChainLen = Chain.size();
3656   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3657         << "\n");
3658   unsigned Sz = R.getVectorElementSize(Chain[0]);
3659   unsigned VF = VecRegSize / Sz;
3660 
3661   if (!isPowerOf2_32(Sz) || VF < 2)
3662     return false;
3663 
3664   // Keep track of values that were deleted by vectorizing in the loop below.
3665   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3666 
3667   bool Changed = false;
3668   // Look for profitable vectorizable trees at all offsets, starting at zero.
3669   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3670     if (i + VF > e)
3671       break;
3672 
3673     // Check that a previous iteration of this loop did not delete the Value.
3674     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3675       continue;
3676 
3677     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3678           << "\n");
3679     ArrayRef<Value *> Operands = Chain.slice(i, VF);
3680 
3681     R.buildTree(Operands);
3682     R.computeMinimumValueSizes();
3683 
3684     int Cost = R.getTreeCost();
3685 
3686     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3687     if (Cost < CostThreshold) {
3688       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3689       R.vectorizeTree();
3690 
3691       // Move to the next bundle.
3692       i += VF - 1;
3693       Changed = true;
3694     }
3695   }
3696 
3697   return Changed;
3698 }
3699 
3700 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3701                                     int costThreshold, BoUpSLP &R) {
3702   SetVector<StoreInst *> Heads, Tails;
3703   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3704 
3705   // We may run into multiple chains that merge into a single chain. We mark the
3706   // stores that we vectorized so that we don't visit the same store twice.
3707   BoUpSLP::ValueSet VectorizedStores;
3708   bool Changed = false;
3709 
3710   // Do a quadratic search on all of the given stores and find
3711   // all of the pairs of stores that follow each other.
3712   SmallVector<unsigned, 16> IndexQueue;
3713   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3714     IndexQueue.clear();
3715     // If a store has multiple consecutive store candidates, search Stores
3716     // array according to the sequence: from i+1 to e, then from i-1 to 0.
3717     // This is because pairing with an immediately succeeding or preceding
3718     // candidate usually creates the best SLP vectorization opportunity.
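         // For example, with i = 2 and e = 5 the visiting order is {3, 4, 1, 0}.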
3719     unsigned j = 0;
3720     for (j = i + 1; j < e; ++j)
3721       IndexQueue.push_back(j);
3722     for (j = i; j > 0; --j)
3723       IndexQueue.push_back(j - 1);
3724 
3725     for (auto &k : IndexQueue) {
3726       if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
3727         Tails.insert(Stores[k]);
3728         Heads.insert(Stores[i]);
3729         ConsecutiveChain[Stores[i]] = Stores[k];
3730         break;
3731       }
3732     }
3733   }
3734 
3735   // For stores that start but don't end a link in the chain:
3736   for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3737        it != e; ++it) {
3738     if (Tails.count(*it))
3739       continue;
3740 
3741     // We found a store instr that starts a chain. Now follow the chain and try
3742     // to vectorize it.
3743     BoUpSLP::ValueList Operands;
3744     StoreInst *I = *it;
3745     // Collect the chain into a list.
3746     while (Tails.count(I) || Heads.count(I)) {
3747       if (VectorizedStores.count(I))
3748         break;
3749       Operands.push_back(I);
3750       // Move to the next value in the chain.
3751       I = ConsecutiveChain[I];
3752     }
3753 
3754     // FIXME: Is division-by-2 the correct step? Should we assert that the
3755     // register size is a power-of-2?
3756     for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) {
3757       if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
3758         // Mark the vectorized stores so that we don't vectorize them again.
3759         VectorizedStores.insert(Operands.begin(), Operands.end());
3760         Changed = true;
3761         break;
3762       }
3763     }
3764   }
3765 
3766   return Changed;
3767 }
3768 
3769 void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
3770 
3771   // Initialize the collections. We will make a single pass over the block.
3772   Stores.clear();
3773   GEPs.clear();
3774 
3775   // Visit the store and getelementptr instructions in BB and organize them in
3776   // Stores and GEPs according to the underlying objects of their pointer
3777   // operands.
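       // Illustrative example (hypothetical IR): stores to a[i] and a[i+1] share
       // the underlying object 'a', so they land in the same StoreList and only
       // stores with a common base are later compared for consecutiveness.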
3778   for (Instruction &I : *BB) {
3779 
3780     // Ignore store instructions that are volatile or have a pointer operand
3781     // that doesn't point to a scalar type.
3782     if (auto *SI = dyn_cast<StoreInst>(&I)) {
3783       if (!SI->isSimple())
3784         continue;
3785       if (!isValidElementType(SI->getValueOperand()->getType()))
3786         continue;
3787       Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
3788     }
3789 
3790     // Ignore getelementptr instructions that have more than one index, a
3791     // constant index, or an index operand whose type is not a valid vector
3792     // element type.
3793     else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
3794       auto Idx = GEP->idx_begin()->get();
3795       if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
3796         continue;
3797       if (!isValidElementType(Idx->getType()))
3798         continue;
3799       GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
3800     }
3801   }
3802 }
3803 
3804 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3805   if (!A || !B)
3806     return false;
3807   Value *VL[] = { A, B };
3808   return tryToVectorizeList(VL, R, None, true);
3809 }
3810 
3811 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3812                                        ArrayRef<Value *> BuildVector,
3813                                        bool allowReorder) {
3814   if (VL.size() < 2)
3815     return false;
3816 
3817   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3818 
3819   // Check that all of the parts are scalar instructions of the same type.
3820   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3821   if (!I0)
3822     return false;
3823 
3824   unsigned Opcode0 = I0->getOpcode();
3825 
3826   // FIXME: Register size should be a parameter to this function, so we can
3827   // try different vectorization factors.
3828   unsigned Sz = R.getVectorElementSize(I0);
3829   unsigned VF = R.getMinVecRegSize() / Sz;
3830 
3831   for (Value *V : VL) {
3832     Type *Ty = V->getType();
3833     if (!isValidElementType(Ty))
3834       return false;
3835     Instruction *Inst = dyn_cast<Instruction>(V);
3836     if (!Inst || Inst->getOpcode() != Opcode0)
3837       return false;
3838   }
3839 
3840   bool Changed = false;
3841 
3842   // Keep track of values that were deleted by vectorizing in the loop below.
3843   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3844 
3845   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3846     unsigned OpsWidth = 0;
3847 
3848     if (i + VF > e)
3849       OpsWidth = e - i;
3850     else
3851       OpsWidth = VF;
3852 
3853     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3854       break;
3855 
3856     // Check that a previous iteration of this loop did not delete the Value.
3857     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3858       continue;
3859 
3860     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3861                  << "\n");
3862     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3863 
3864     ArrayRef<Value *> BuildVectorSlice;
3865     if (!BuildVector.empty())
3866       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3867 
3868     R.buildTree(Ops, BuildVectorSlice);
3869     // TODO: Check whether we can also allow reordering for cases other than
3870     // tryToVectorizePair().
3871     if (allowReorder && R.shouldReorder()) {
3872       assert(Ops.size() == 2);
3873       assert(BuildVectorSlice.empty());
3874       Value *ReorderedOps[] = { Ops[1], Ops[0] };
3875       R.buildTree(ReorderedOps, None);
3876     }
3877     R.computeMinimumValueSizes();
3878     int Cost = R.getTreeCost();
3879 
3880     if (Cost < -SLPCostThreshold) {
3881       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3882       Value *VectorizedRoot = R.vectorizeTree();
3883 
3884       // Reconstruct the build vector by extracting the vectorized root. This
3885       // way we handle the case where some elements of the vector are undefined.
3886       //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
3887       if (!BuildVectorSlice.empty()) {
3888         // The insert point is the last build vector instruction. The vectorized
3889         // root will precede it. This guarantees that we get an instruction. The
3890         // vectorized tree could have been constant folded.
3891         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3892         unsigned VecIdx = 0;
3893         for (auto &V : BuildVectorSlice) {
3894           IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
3895                                       ++BasicBlock::iterator(InsertAfter));
3896           Instruction *I = cast<Instruction>(V);
3897           assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
3898           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3899               VectorizedRoot, Builder.getInt32(VecIdx++)));
3900           I->setOperand(1, Extract);
3901           I->removeFromParent();
3902           I->insertAfter(Extract);
3903           InsertAfter = I;
3904         }
3905       }
3906       // Move to the next bundle.
3907       i += VF - 1;
3908       Changed = true;
3909     }
3910   }
3911 
3912   return Changed;
3913 }
3914 
3915 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3916   if (!V)
3917     return false;
3918 
3919   // Try to vectorize V.
3920   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3921     return true;
3922 
3923   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3924   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3925   // Try to skip B.
3926   if (B && B->hasOneUse()) {
3927     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3928     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3929     if (tryToVectorizePair(A, B0, R)) {
3930       return true;
3931     }
3932     if (tryToVectorizePair(A, B1, R)) {
3933       return true;
3934     }
3935   }
3936 
3937   // Try to skip A.
3938   if (A && A->hasOneUse()) {
3939     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3940     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3941     if (tryToVectorizePair(A0, B, R)) {
3942       return true;
3943     }
3944     if (tryToVectorizePair(A1, B, R)) {
3945       return true;
3946     }
3947   }
3948   return false;
3949 }
3950 
3951 /// \brief Generate a shuffle mask to be used in a reduction tree.
3952 ///
3953 /// \param VecLen The length of the vector to be reduced.
3954 /// \param NumEltsToRdx The number of elements that should be reduced in the
3955 ///        vector.
3956 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3957 ///        reduction. A pairwise reduction will generate a mask of
3958 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
3959 ///        <2, 3, undef, undef> for a vector of 4 and NumEltsToRdx = 2.
3960 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
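     /// For example, createRdxShuffleMask(4, 2, /*IsPairwise=*/false,
     /// /*IsLeft=*/false, Builder) yields the constant mask <2, 3, undef, undef>,
     /// which moves the upper half of a 4-wide vector into the lower half.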
3961 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3962                                    bool IsPairwise, bool IsLeft,
3963                                    IRBuilder<> &Builder) {
3964   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3965 
3966   SmallVector<Constant *, 32> ShuffleMask(
3967       VecLen, UndefValue::get(Builder.getInt32Ty()));
3968 
3969   if (IsPairwise)
3970     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3971     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3972       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3973   else
3974     // Move the upper half of the vector to the lower half.
3975     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3976       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3977 
3978   return ConstantVector::get(ShuffleMask);
3979 }
3980 
3981 
3982 /// Model horizontal reductions.
3983 ///
3984 /// A horizontal reduction is a tree of reduction operations (currently add and
3985 /// fadd) that has operations that can be put into a vector as its leaf.
3986 /// For example, this tree:
3987 ///
3988 /// mul mul mul mul
3989 ///  \  /    \  /
3990 ///   +       +
3991 ///    \     /
3992 ///       +
3993 /// This tree has "mul" as its reduced values and "+" as its reduction
3994 /// operations. A reduction might be feeding into a store or a binary operation
3995 /// feeding a phi.
3996 ///    ...
3997 ///    \  /
3998 ///     +
3999 ///     |
4000 ///  phi +=
4001 ///
4002 ///  Or:
4003 ///    ...
4004 ///    \  /
4005 ///     +
4006 ///     |
4007 ///   *p =
4008 ///
4009 class HorizontalReduction {
4010   SmallVector<Value *, 16> ReductionOps;
4011   SmallVector<Value *, 32> ReducedVals;
4012 
4013   BinaryOperator *ReductionRoot;
4014   PHINode *ReductionPHI;
4015 
4016   /// The opcode of the reduction.
4017   unsigned ReductionOpcode;
4018   /// The opcode of the values we perform a reduction on.
4019   unsigned ReducedValueOpcode;
4020   /// Should we model this reduction as a pairwise reduction tree or a tree that
4021   /// splits the vector in halves and adds those halves.
4022   bool IsPairwiseReduction;
4023 
4024 public:
4025   /// The width of one full horizontal reduction operation.
4026   unsigned ReduxWidth;
4027 
4028   /// Minimal width of available vector registers. It's used to determine
4029   /// ReduxWidth.
4030   unsigned MinVecRegSize;
4031 
4032   HorizontalReduction(unsigned MinVecRegSize)
4033       : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
4034         ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
4035         MinVecRegSize(MinVecRegSize) {}
4036 
4037   /// \brief Try to find a reduction tree.
4038   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
4039     assert((!Phi ||
4040             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
4041            "The phi needs to use the binary operator");
4042 
4043     // We could have an initial reduction that is not an add.
4044     //  r *= v1 + v2 + v3 + v4
4045     // In such a case start looking for a tree rooted in the first '+'.
4046     if (Phi) {
4047       if (B->getOperand(0) == Phi) {
4048         Phi = nullptr;
4049         B = dyn_cast<BinaryOperator>(B->getOperand(1));
4050       } else if (B->getOperand(1) == Phi) {
4051         Phi = nullptr;
4052         B = dyn_cast<BinaryOperator>(B->getOperand(0));
4053       }
4054     }
4055 
4056     if (!B)
4057       return false;
4058 
4059     Type *Ty = B->getType();
4060     if (!isValidElementType(Ty))
4061       return false;
4062 
4063     const DataLayout &DL = B->getModule()->getDataLayout();
4064     ReductionOpcode = B->getOpcode();
4065     ReducedValueOpcode = 0;
4066     // FIXME: Register size should be a parameter to this function, so we can
4067     // try different vectorization factors.
4068     ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
4069     ReductionRoot = B;
4070     ReductionPHI = Phi;
4071 
4072     if (ReduxWidth < 4)
4073       return false;
4074 
4075     // We currently only support adds.
4076     if (ReductionOpcode != Instruction::Add &&
4077         ReductionOpcode != Instruction::FAdd)
4078       return false;
4079 
4080     // Post order traverse the reduction tree starting at B. We only handle true
4081     // trees containing only binary operators or selects.
4082     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
4083     Stack.push_back(std::make_pair(B, 0));
4084     while (!Stack.empty()) {
4085       Instruction *TreeN = Stack.back().first;
4086       unsigned EdgeToVist = Stack.back().second++;
4087       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
4088 
4089       // Only handle trees in the current basic block.
4090       if (TreeN->getParent() != B->getParent())
4091         return false;
4092 
4093       // Each tree node needs to have one user except for the ultimate
4094       // reduction.
4095       if (!TreeN->hasOneUse() && TreeN != B)
4096         return false;
4097 
4098       // Postorder visit.
4099       if (EdgeToVist == 2 || IsReducedValue) {
4100         if (IsReducedValue) {
4101           // Make sure that the opcodes of the operations that we are going to
4102           // reduce match.
4103           if (!ReducedValueOpcode)
4104             ReducedValueOpcode = TreeN->getOpcode();
4105           else if (ReducedValueOpcode != TreeN->getOpcode())
4106             return false;
4107           ReducedVals.push_back(TreeN);
4108         } else {
4109           // We need to be able to reassociate the adds.
4110           if (!TreeN->isAssociative())
4111             return false;
4112           ReductionOps.push_back(TreeN);
4113         }
4114         // Retract.
4115         Stack.pop_back();
4116         continue;
4117       }
4118 
4119       // Visit left or right.
4120       Value *NextV = TreeN->getOperand(EdgeToVist);
4121       // We currently only allow BinaryOperator and SelectInst instructions
4122       // as reduction values in our tree.
4123       if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
4124         Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
4125       else if (NextV != Phi)
4126         return false;
4127     }
4128     return true;
4129   }
4130 
4131   /// \brief Attempt to vectorize the tree found by
4132   /// matchAssociativeReduction.
4133   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
4134     if (ReducedVals.empty())
4135       return false;
4136 
4137     unsigned NumReducedVals = ReducedVals.size();
4138     if (NumReducedVals < ReduxWidth)
4139       return false;
4140 
4141     Value *VectorizedTree = nullptr;
4142     IRBuilder<> Builder(ReductionRoot);
4143     FastMathFlags Unsafe;
4144     Unsafe.setUnsafeAlgebra();
4145     Builder.setFastMathFlags(Unsafe);
4146     unsigned i = 0;
4147 
    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
      V.computeMinimumValueSizes();

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
          cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

  unsigned numReductionValues() const {
    return ReducedVals.size();
  }

private:
  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

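    // Compare the cost of a pairwise reduction (adjacent lanes are combined)
    // against a splitting reduction (the upper half is combined with the
    // lower half) and use whichever form the target reports as cheaper. The
    // returned value is the saving relative to ReduxWidth scalar operations.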
    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);

    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");

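    // Perform log2(ReduxWidth) shuffle-and-combine steps. In the pairwise
    // form adjacent elements are combined; in the splitting form the upper
    // half of the vector is combined with the lower half.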
    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
          createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
          createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
          TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
          TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
          "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
          createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
          TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};

/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// \returns true if it matches.
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

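  // Walk the chain of insertelement instructions, recording each instruction
  // and the scalar it inserts. Every link in the chain must have exactly one
  // use (the next insertelement); only the final vector may be used more than
  // once.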
  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}

/// \brief Like findBuildVector, but looks backwards for construction of an
/// aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
                               SmallVectorImpl<Value *> &BuildVector,
                               SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!IV->hasOneUse())
    return false;
  Value *V = IV->getAggregateOperand();
  if (!isa<UndefValue>(V)) {
    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
      return false;
  }
  BuildVector.push_back(IV);
  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
  return true;
}

static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

/// \brief Try to get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return (
        dyn_cast<Instruction>(R) &&
        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch; return the incoming value if it comes from it.
  // This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// \brief Attempt to match and vectorize a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \p P
/// with the reduction operator \p BI, then check whether it is profitable
/// and, if so, perform the reduction.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI,
                                        unsigned MinRegSize) {
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx(MinRegSize);
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce to a nearby
  // power-of-2. We can safely generate oversized vectors and rely on the
  // backend to split them to legal sizes.
  HorRdx.ReduxWidth =
    std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}

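// Scan the block for vectorizable code rooted at PHI nodes, reductions,
// stores, returns, compares, insertelement chains, and aggregate builds.
// Whenever vectorization succeeds, the scan restarts from the beginning of
// the block because instructions may have been erased.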
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the PHI nodes that we have not vectorized yet.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

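  // Scan the rest of the block for other vectorization roots. The scan is
  // restarted from the beginning of the block after every successful
  // vectorization, since instructions may have been erased and the iterator
  // invalidated.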
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times so skip instructions we have
    // already visited.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

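      // Otherwise, try to vectorize the tree rooted at the operand of the
      // binary operator that is not the reduction phi itself.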
      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

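    // Try to vectorize horizontal reductions feeding into a store.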
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
                                          R.getMinVecRegSize()) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

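      // The compare operands themselves did not form a vectorizable pair; try
      // the operands of any binary operators feeding the compare instead.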
      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }

    // Try to vectorize trees that start at insertvalue instructions feeding
    // into a store.
    if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
      if (InsertValueInst *LastInsertValue =
              dyn_cast<InsertValueInst>(SI->getValueOperand())) {
        const DataLayout &DL = BB->getModule()->getDataLayout();
        if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
          SmallVector<Value *, 16> BuildVector;
          SmallVector<Value *, 16> BuildVectorOpds;
          if (!findBuildAggregate(LastInsertValue, BuildVector,
                                  BuildVectorOpds))
            continue;

          DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI
                       << "\n");
          if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
          }
          continue;
        }
      }
    }
  }

  return Changed;
}

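// Attempt to vectorize the index computations of the collected getelementptr
// lists. Within each list, pairs of getelementptrs that differ by a constant
// are pruned, and the remaining single non-constant indices are bundled and
// vectorized.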
bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << Entry.second.size() << ".\n");

    // We process the getelementptr list in chunks of 16 (like we do for
    // stores) to minimize compile-time.
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, the WeakVHs will have nullified the
      // values, so remove them from the set of candidates.
      Candidates.remove(nullptr);

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPList[I]);
            Candidates.remove(GEPList[J]);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPList[J]);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

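// Attempt to vectorize each collected group of stores. The groups are
// processed in chunks of 16 stores to keep compile time bounded.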
bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
       ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
          << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    // TODO: The limit of 16 inhibits greater vectorization factors.
    //       For example, AVX2 supports v32i8. Increasing this limit, however,
    //       may cause a significant compile-time increase.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}