1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10 // stores that can be put together into vector-stores. Next, it attempts to
11 // construct vectorizable tree using the use-def chains. If a profitable tree
12 // was found, the SLP vectorizer performs vectorization on the tree.
13 //
14 // The pass is inspired by the work described in the paper:
15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16 //
17 //===----------------------------------------------------------------------===//
18 #include "llvm/Transforms/Vectorize.h"
19 #include "llvm/ADT/MapVector.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/PostOrderIterator.h"
22 #include "llvm/ADT/SetVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/GlobalsModRef.h"
26 #include "llvm/Analysis/AssumptionCache.h"
27 #include "llvm/Analysis/CodeMetrics.h"
28 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/Analysis/LoopAccessAnalysis.h"
30 #include "llvm/Analysis/ScalarEvolution.h"
31 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
32 #include "llvm/Analysis/TargetTransformInfo.h"
33 #include "llvm/Analysis/ValueTracking.h"
34 #include "llvm/IR/DataLayout.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/IRBuilder.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Module.h"
40 #include "llvm/IR/NoFolder.h"
41 #include "llvm/IR/Type.h"
42 #include "llvm/IR/Value.h"
43 #include "llvm/IR/Verifier.h"
44 #include "llvm/Pass.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/raw_ostream.h"
48 #include "llvm/Analysis/VectorUtils.h"
49 #include <algorithm>
50 #include <map>
51 #include <memory>
52 
53 using namespace llvm;
54 
55 #define SV_NAME "slp-vectorizer"
56 #define DEBUG_TYPE "SLP"
57 
58 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
59 
60 static cl::opt<int>
61     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
62                      cl::desc("Only vectorize if you gain more than this "
63                               "number "));
64 
65 static cl::opt<bool>
66 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
67                    cl::desc("Attempt to vectorize horizontal reductions"));
68 
69 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
70     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
71     cl::desc(
72         "Attempt to vectorize horizontal reductions feeding into a store"));
73 
74 static cl::opt<int>
75 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
76     cl::desc("Attempt to vectorize for this register size in bits"));
77 
78 /// Limits the size of scheduling regions in a block.
79 /// It avoid long compile times for _very_ large blocks where vector
80 /// instructions are spread over a wide range.
81 /// This limit is way higher than needed by real-world functions.
82 static cl::opt<int>
83 ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
84     cl::desc("Limit the size of the SLP scheduling region per block"));
85 
86 namespace {
87 
88 // FIXME: Set this via cl::opt to allow overriding.
89 static const unsigned MinVecRegSize = 128;
90 
91 static const unsigned RecursionMaxDepth = 12;
92 
93 // Limit the number of alias checks. The limit is chosen so that
94 // it has no negative effect on the llvm benchmarks.
95 static const unsigned AliasedCheckLimit = 10;
96 
97 // Another limit for the alias checks: The maximum distance between load/store
98 // instructions where alias checks are done.
99 // This limit is useful for very large basic blocks.
100 static const unsigned MaxMemDepDistance = 160;
101 
102 /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
103 /// regions to be handled.
104 static const int MinScheduleRegionSize = 16;
105 
106 /// \brief Predicate for the element types that the SLP vectorizer supports.
107 ///
108 /// The most important thing to filter here are types which are invalid in LLVM
109 /// vectors. We also filter target specific types which have absolutely no
110 /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
111 /// avoids spending time checking the cost model and realizing that they will
112 /// be inevitably scalarized.
113 static bool isValidElementType(Type *Ty) {
114   return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
115          !Ty->isPPC_FP128Ty();
116 }
117 
118 /// \returns the parent basic block if all of the instructions in \p VL
119 /// are in the same block or null otherwise.
120 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
121   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
122   if (!I0)
123     return nullptr;
124   BasicBlock *BB = I0->getParent();
125   for (int i = 1, e = VL.size(); i < e; i++) {
126     Instruction *I = dyn_cast<Instruction>(VL[i]);
127     if (!I)
128       return nullptr;
129 
130     if (BB != I->getParent())
131       return nullptr;
132   }
133   return BB;
134 }
135 
136 /// \returns True if all of the values in \p VL are constants.
137 static bool allConstant(ArrayRef<Value *> VL) {
138   for (unsigned i = 0, e = VL.size(); i < e; ++i)
139     if (!isa<Constant>(VL[i]))
140       return false;
141   return true;
142 }
143 
144 /// \returns True if all of the values in \p VL are identical.
145 static bool isSplat(ArrayRef<Value *> VL) {
146   for (unsigned i = 1, e = VL.size(); i < e; ++i)
147     if (VL[i] != VL[0])
148       return false;
149   return true;
150 }
151 
152 ///\returns Opcode that can be clubbed with \p Op to create an alternate
153 /// sequence which can later be merged as a ShuffleVector instruction.
154 static unsigned getAltOpcode(unsigned Op) {
155   switch (Op) {
156   case Instruction::FAdd:
157     return Instruction::FSub;
158   case Instruction::FSub:
159     return Instruction::FAdd;
160   case Instruction::Add:
161     return Instruction::Sub;
162   case Instruction::Sub:
163     return Instruction::Add;
164   default:
165     return 0;
166   }
167 }
168 
169 ///\returns bool representing if Opcode \p Op can be part
170 /// of an alternate sequence which can later be merged as
171 /// a ShuffleVector instruction.
172 static bool canCombineAsAltInst(unsigned Op) {
173   return Op == Instruction::FAdd || Op == Instruction::FSub ||
174          Op == Instruction::Sub || Op == Instruction::Add;
175 }
176 
177 /// \returns ShuffleVector instruction if instructions in \p VL have
178 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
179 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
180 static unsigned isAltInst(ArrayRef<Value *> VL) {
181   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
182   unsigned Opcode = I0->getOpcode();
183   unsigned AltOpcode = getAltOpcode(Opcode);
184   for (int i = 1, e = VL.size(); i < e; i++) {
185     Instruction *I = dyn_cast<Instruction>(VL[i]);
186     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
187       return 0;
188   }
189   return Instruction::ShuffleVector;
190 }
191 
192 /// \returns The opcode if all of the Instructions in \p VL have the same
193 /// opcode, or zero.
194 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
195   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
196   if (!I0)
197     return 0;
198   unsigned Opcode = I0->getOpcode();
199   for (int i = 1, e = VL.size(); i < e; i++) {
200     Instruction *I = dyn_cast<Instruction>(VL[i]);
201     if (!I || Opcode != I->getOpcode()) {
202       if (canCombineAsAltInst(Opcode) && i == 1)
203         return isAltInst(VL);
204       return 0;
205     }
206   }
207   return Opcode;
208 }
209 
210 /// Get the intersection (logical and) of all of the potential IR flags
211 /// of each scalar operation (VL) that will be converted into a vector (I).
212 /// Flag set: NSW, NUW, exact, and all of fast-math.
213 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
214   if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
215     if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
216       // Intersection is initialized to the 0th scalar,
217       // so start counting from index '1'.
218       for (int i = 1, e = VL.size(); i < e; ++i) {
219         if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
220           Intersection->andIRFlags(Scalar);
221       }
222       VecOp->copyIRFlags(Intersection);
223     }
224   }
225 }
226 
227 /// \returns \p I after propagating metadata from \p VL.
228 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
229   Instruction *I0 = cast<Instruction>(VL[0]);
230   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
231   I0->getAllMetadataOtherThanDebugLoc(Metadata);
232 
233   for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
234     unsigned Kind = Metadata[i].first;
235     MDNode *MD = Metadata[i].second;
236 
237     for (int i = 1, e = VL.size(); MD && i != e; i++) {
238       Instruction *I = cast<Instruction>(VL[i]);
239       MDNode *IMD = I->getMetadata(Kind);
240 
241       switch (Kind) {
242       default:
243         MD = nullptr; // Remove unknown metadata
244         break;
245       case LLVMContext::MD_tbaa:
246         MD = MDNode::getMostGenericTBAA(MD, IMD);
247         break;
248       case LLVMContext::MD_alias_scope:
249         MD = MDNode::getMostGenericAliasScope(MD, IMD);
250         break;
251       case LLVMContext::MD_noalias:
252         MD = MDNode::intersect(MD, IMD);
253         break;
254       case LLVMContext::MD_fpmath:
255         MD = MDNode::getMostGenericFPMath(MD, IMD);
256         break;
257       case LLVMContext::MD_nontemporal:
258         MD = MDNode::intersect(MD, IMD);
259         break;
260       }
261     }
262     I->setMetadata(Kind, MD);
263   }
264   return I;
265 }
266 
267 /// \returns The type that all of the values in \p VL have or null if there
268 /// are different types.
269 static Type* getSameType(ArrayRef<Value *> VL) {
270   Type *Ty = VL[0]->getType();
271   for (int i = 1, e = VL.size(); i < e; i++)
272     if (VL[i]->getType() != Ty)
273       return nullptr;
274 
275   return Ty;
276 }
277 
278 /// \returns True if the ExtractElement instructions in VL can be vectorized
279 /// to use the original vector.
280 static bool CanReuseExtract(ArrayRef<Value *> VL) {
281   assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
282   // Check if all of the extracts come from the same vector and from the
283   // correct offset.
284   Value *VL0 = VL[0];
285   ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
286   Value *Vec = E0->getOperand(0);
287 
288   // We have to extract from the same vector type.
289   unsigned NElts = Vec->getType()->getVectorNumElements();
290 
291   if (NElts != VL.size())
292     return false;
293 
294   // Check that all of the indices extract from the correct offset.
295   ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
296   if (!CI || CI->getZExtValue())
297     return false;
298 
299   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
300     ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
301     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
302 
303     if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
304       return false;
305   }
306 
307   return true;
308 }
309 
310 /// \returns True if in-tree use also needs extract. This refers to
311 /// possible scalar operand in vectorized instruction.
312 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
313                                     TargetLibraryInfo *TLI) {
314 
315   unsigned Opcode = UserInst->getOpcode();
316   switch (Opcode) {
317   case Instruction::Load: {
318     LoadInst *LI = cast<LoadInst>(UserInst);
319     return (LI->getPointerOperand() == Scalar);
320   }
321   case Instruction::Store: {
322     StoreInst *SI = cast<StoreInst>(UserInst);
323     return (SI->getPointerOperand() == Scalar);
324   }
325   case Instruction::Call: {
326     CallInst *CI = cast<CallInst>(UserInst);
327     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
328     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
329       return (CI->getArgOperand(1) == Scalar);
330     }
331   }
332   default:
333     return false;
334   }
335 }
336 
337 /// \returns the AA location that is being access by the instruction.
338 static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
339   if (StoreInst *SI = dyn_cast<StoreInst>(I))
340     return MemoryLocation::get(SI);
341   if (LoadInst *LI = dyn_cast<LoadInst>(I))
342     return MemoryLocation::get(LI);
343   return MemoryLocation();
344 }
345 
346 /// \returns True if the instruction is not a volatile or atomic load/store.
347 static bool isSimple(Instruction *I) {
348   if (LoadInst *LI = dyn_cast<LoadInst>(I))
349     return LI->isSimple();
350   if (StoreInst *SI = dyn_cast<StoreInst>(I))
351     return SI->isSimple();
352   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
353     return !MI->isVolatile();
354   return true;
355 }
356 
357 /// Bottom Up SLP Vectorizer.
358 class BoUpSLP {
359 public:
360   typedef SmallVector<Value *, 8> ValueList;
361   typedef SmallVector<Instruction *, 16> InstrList;
362   typedef SmallPtrSet<Value *, 16> ValueSet;
363   typedef SmallVector<StoreInst *, 8> StoreList;
364 
365   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
366           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
367           DominatorTree *Dt, AssumptionCache *AC)
368       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
369         SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
370         Builder(Se->getContext()) {
371     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
372   }
373 
374   /// \brief Vectorize the tree that starts with the elements in \p VL.
375   /// Returns the vectorized root.
376   Value *vectorizeTree();
377 
378   /// \returns the cost incurred by unwanted spills and fills, caused by
379   /// holding live values over call sites.
380   int getSpillCost();
381 
382   /// \returns the vectorization cost of the subtree that starts at \p VL.
383   /// A negative number means that this is profitable.
384   int getTreeCost();
385 
386   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
387   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
388   void buildTree(ArrayRef<Value *> Roots,
389                  ArrayRef<Value *> UserIgnoreLst = None);
390 
391   /// Clear the internal data structures that are created by 'buildTree'.
392   void deleteTree() {
393     VectorizableTree.clear();
394     ScalarToTreeEntry.clear();
395     MustGather.clear();
396     ExternalUses.clear();
397     NumLoadsWantToKeepOrder = 0;
398     NumLoadsWantToChangeOrder = 0;
399     for (auto &Iter : BlocksSchedules) {
400       BlockScheduling *BS = Iter.second.get();
401       BS->clear();
402     }
403   }
404 
405   /// \brief Perform LICM and CSE on the newly generated gather sequences.
406   void optimizeGatherSequence();
407 
408   /// \returns true if it is beneficial to reverse the vector order.
409   bool shouldReorder() const {
410     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
411   }
412 
413   /// \return The vector element size in bits to use when vectorizing the
414   /// expression tree ending at \p V. If V is a store, the size is the width of
415   /// the stored value. Otherwise, the size is the width of the largest loaded
416   /// value reaching V. This method is used by the vectorizer to calculate
417   /// vectorization factors.
418   unsigned getVectorElementSize(Value *V);
419 
420 private:
421   struct TreeEntry;
422 
423   /// \returns the cost of the vectorizable entry.
424   int getEntryCost(TreeEntry *E);
425 
426   /// This is the recursive part of buildTree.
427   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
428 
429   /// Vectorize a single entry in the tree.
430   Value *vectorizeTree(TreeEntry *E);
431 
432   /// Vectorize a single entry in the tree, starting in \p VL.
433   Value *vectorizeTree(ArrayRef<Value *> VL);
434 
435   /// \returns the pointer to the vectorized value if \p VL is already
436   /// vectorized, or NULL. They may happen in cycles.
437   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
438 
439   /// \returns the scalarization cost for this type. Scalarization in this
440   /// context means the creation of vectors from a group of scalars.
441   int getGatherCost(Type *Ty);
442 
443   /// \returns the scalarization cost for this list of values. Assuming that
444   /// this subtree gets vectorized, we may need to extract the values from the
445   /// roots. This method calculates the cost of extracting the values.
446   int getGatherCost(ArrayRef<Value *> VL);
447 
448   /// \brief Set the Builder insert point to one after the last instruction in
449   /// the bundle
450   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
451 
452   /// \returns a vector from a collection of scalars in \p VL.
453   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
454 
455   /// \returns whether the VectorizableTree is fully vectorizable and will
456   /// be beneficial even the tree height is tiny.
457   bool isFullyVectorizableTinyTree();
458 
459   /// \reorder commutative operands in alt shuffle if they result in
460   ///  vectorized code.
461   void reorderAltShuffleOperands(ArrayRef<Value *> VL,
462                                  SmallVectorImpl<Value *> &Left,
463                                  SmallVectorImpl<Value *> &Right);
464   /// \reorder commutative operands to get better probability of
465   /// generating vectorized code.
466   void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
467                                       SmallVectorImpl<Value *> &Left,
468                                       SmallVectorImpl<Value *> &Right);
469   struct TreeEntry {
470     TreeEntry() : Scalars(), VectorizedValue(nullptr),
471     NeedToGather(0) {}
472 
473     /// \returns true if the scalars in VL are equal to this entry.
474     bool isSame(ArrayRef<Value *> VL) const {
475       assert(VL.size() == Scalars.size() && "Invalid size");
476       return std::equal(VL.begin(), VL.end(), Scalars.begin());
477     }
478 
479     /// A vector of scalars.
480     ValueList Scalars;
481 
482     /// The Scalars are vectorized into this value. It is initialized to Null.
483     Value *VectorizedValue;
484 
485     /// Do we need to gather this sequence ?
486     bool NeedToGather;
487   };
488 
489   /// Create a new VectorizableTree entry.
490   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
491     VectorizableTree.emplace_back();
492     int idx = VectorizableTree.size() - 1;
493     TreeEntry *Last = &VectorizableTree[idx];
494     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
495     Last->NeedToGather = !Vectorized;
496     if (Vectorized) {
497       for (int i = 0, e = VL.size(); i != e; ++i) {
498         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
499         ScalarToTreeEntry[VL[i]] = idx;
500       }
501     } else {
502       MustGather.insert(VL.begin(), VL.end());
503     }
504     return Last;
505   }
506 
507   /// -- Vectorization State --
508   /// Holds all of the tree entries.
509   std::vector<TreeEntry> VectorizableTree;
510 
511   /// Maps a specific scalar to its tree entry.
512   SmallDenseMap<Value*, int> ScalarToTreeEntry;
513 
514   /// A list of scalars that we found that we need to keep as scalars.
515   ValueSet MustGather;
516 
517   /// This POD struct describes one external user in the vectorized tree.
518   struct ExternalUser {
519     ExternalUser (Value *S, llvm::User *U, int L) :
520       Scalar(S), User(U), Lane(L){}
521     // Which scalar in our function.
522     Value *Scalar;
523     // Which user that uses the scalar.
524     llvm::User *User;
525     // Which lane does the scalar belong to.
526     int Lane;
527   };
528   typedef SmallVector<ExternalUser, 16> UserList;
529 
530   /// Checks if two instructions may access the same memory.
531   ///
532   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
533   /// is invariant in the calling loop.
534   bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
535                  Instruction *Inst2) {
536 
537     // First check if the result is already in the cache.
538     AliasCacheKey key = std::make_pair(Inst1, Inst2);
539     Optional<bool> &result = AliasCache[key];
540     if (result.hasValue()) {
541       return result.getValue();
542     }
543     MemoryLocation Loc2 = getLocation(Inst2, AA);
544     bool aliased = true;
545     if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
546       // Do the alias check.
547       aliased = AA->alias(Loc1, Loc2);
548     }
549     // Store the result in the cache.
550     result = aliased;
551     return aliased;
552   }
553 
554   typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
555 
556   /// Cache for alias results.
557   /// TODO: consider moving this to the AliasAnalysis itself.
558   DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
559 
560   /// Removes an instruction from its block and eventually deletes it.
561   /// It's like Instruction::eraseFromParent() except that the actual deletion
562   /// is delayed until BoUpSLP is destructed.
563   /// This is required to ensure that there are no incorrect collisions in the
564   /// AliasCache, which can happen if a new instruction is allocated at the
565   /// same address as a previously deleted instruction.
566   void eraseInstruction(Instruction *I) {
567     I->removeFromParent();
568     I->dropAllReferences();
569     DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
570   }
571 
572   /// Temporary store for deleted instructions. Instructions will be deleted
573   /// eventually when the BoUpSLP is destructed.
574   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
575 
576   /// A list of values that need to extracted out of the tree.
577   /// This list holds pairs of (Internal Scalar : External User).
578   UserList ExternalUses;
579 
580   /// Values used only by @llvm.assume calls.
581   SmallPtrSet<const Value *, 32> EphValues;
582 
583   /// Holds all of the instructions that we gathered.
584   SetVector<Instruction *> GatherSeq;
585   /// A list of blocks that we are going to CSE.
586   SetVector<BasicBlock *> CSEBlocks;
587 
588   /// Contains all scheduling relevant data for an instruction.
589   /// A ScheduleData either represents a single instruction or a member of an
590   /// instruction bundle (= a group of instructions which is combined into a
591   /// vector instruction).
592   struct ScheduleData {
593 
594     // The initial value for the dependency counters. It means that the
595     // dependencies are not calculated yet.
596     enum { InvalidDeps = -1 };
597 
598     ScheduleData()
599         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
600           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
601           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
602           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
603 
604     void init(int BlockSchedulingRegionID) {
605       FirstInBundle = this;
606       NextInBundle = nullptr;
607       NextLoadStore = nullptr;
608       IsScheduled = false;
609       SchedulingRegionID = BlockSchedulingRegionID;
610       UnscheduledDepsInBundle = UnscheduledDeps;
611       clearDependencies();
612     }
613 
614     /// Returns true if the dependency information has been calculated.
615     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
616 
617     /// Returns true for single instructions and for bundle representatives
618     /// (= the head of a bundle).
619     bool isSchedulingEntity() const { return FirstInBundle == this; }
620 
621     /// Returns true if it represents an instruction bundle and not only a
622     /// single instruction.
623     bool isPartOfBundle() const {
624       return NextInBundle != nullptr || FirstInBundle != this;
625     }
626 
627     /// Returns true if it is ready for scheduling, i.e. it has no more
628     /// unscheduled depending instructions/bundles.
629     bool isReady() const {
630       assert(isSchedulingEntity() &&
631              "can't consider non-scheduling entity for ready list");
632       return UnscheduledDepsInBundle == 0 && !IsScheduled;
633     }
634 
635     /// Modifies the number of unscheduled dependencies, also updating it for
636     /// the whole bundle.
637     int incrementUnscheduledDeps(int Incr) {
638       UnscheduledDeps += Incr;
639       return FirstInBundle->UnscheduledDepsInBundle += Incr;
640     }
641 
642     /// Sets the number of unscheduled dependencies to the number of
643     /// dependencies.
644     void resetUnscheduledDeps() {
645       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
646     }
647 
648     /// Clears all dependency information.
649     void clearDependencies() {
650       Dependencies = InvalidDeps;
651       resetUnscheduledDeps();
652       MemoryDependencies.clear();
653     }
654 
655     void dump(raw_ostream &os) const {
656       if (!isSchedulingEntity()) {
657         os << "/ " << *Inst;
658       } else if (NextInBundle) {
659         os << '[' << *Inst;
660         ScheduleData *SD = NextInBundle;
661         while (SD) {
662           os << ';' << *SD->Inst;
663           SD = SD->NextInBundle;
664         }
665         os << ']';
666       } else {
667         os << *Inst;
668       }
669     }
670 
671     Instruction *Inst;
672 
673     /// Points to the head in an instruction bundle (and always to this for
674     /// single instructions).
675     ScheduleData *FirstInBundle;
676 
677     /// Single linked list of all instructions in a bundle. Null if it is a
678     /// single instruction.
679     ScheduleData *NextInBundle;
680 
681     /// Single linked list of all memory instructions (e.g. load, store, call)
682     /// in the block - until the end of the scheduling region.
683     ScheduleData *NextLoadStore;
684 
685     /// The dependent memory instructions.
686     /// This list is derived on demand in calculateDependencies().
687     SmallVector<ScheduleData *, 4> MemoryDependencies;
688 
689     /// This ScheduleData is in the current scheduling region if this matches
690     /// the current SchedulingRegionID of BlockScheduling.
691     int SchedulingRegionID;
692 
693     /// Used for getting a "good" final ordering of instructions.
694     int SchedulingPriority;
695 
696     /// The number of dependencies. Constitutes of the number of users of the
697     /// instruction plus the number of dependent memory instructions (if any).
698     /// This value is calculated on demand.
699     /// If InvalidDeps, the number of dependencies is not calculated yet.
700     ///
701     int Dependencies;
702 
703     /// The number of dependencies minus the number of dependencies of scheduled
704     /// instructions. As soon as this is zero, the instruction/bundle gets ready
705     /// for scheduling.
706     /// Note that this is negative as long as Dependencies is not calculated.
707     int UnscheduledDeps;
708 
709     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
710     /// single instructions.
711     int UnscheduledDepsInBundle;
712 
713     /// True if this instruction is scheduled (or considered as scheduled in the
714     /// dry-run).
715     bool IsScheduled;
716   };
717 
718 #ifndef NDEBUG
719   friend raw_ostream &operator<<(raw_ostream &os,
720                                  const BoUpSLP::ScheduleData &SD);
721 #endif
722 
723   /// Contains all scheduling data for a basic block.
724   ///
725   struct BlockScheduling {
726 
727     BlockScheduling(BasicBlock *BB)
728         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
729           ScheduleStart(nullptr), ScheduleEnd(nullptr),
730           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
731           ScheduleRegionSize(0),
732           ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
733           // Make sure that the initial SchedulingRegionID is greater than the
734           // initial SchedulingRegionID in ScheduleData (which is 0).
735           SchedulingRegionID(1) {}
736 
737     void clear() {
738       ReadyInsts.clear();
739       ScheduleStart = nullptr;
740       ScheduleEnd = nullptr;
741       FirstLoadStoreInRegion = nullptr;
742       LastLoadStoreInRegion = nullptr;
743 
744       // Reduce the maximum schedule region size by the size of the
745       // previous scheduling run.
746       ScheduleRegionSizeLimit -= ScheduleRegionSize;
747       if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
748         ScheduleRegionSizeLimit = MinScheduleRegionSize;
749       ScheduleRegionSize = 0;
750 
751       // Make a new scheduling region, i.e. all existing ScheduleData is not
752       // in the new region yet.
753       ++SchedulingRegionID;
754     }
755 
756     ScheduleData *getScheduleData(Value *V) {
757       ScheduleData *SD = ScheduleDataMap[V];
758       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
759         return SD;
760       return nullptr;
761     }
762 
763     bool isInSchedulingRegion(ScheduleData *SD) {
764       return SD->SchedulingRegionID == SchedulingRegionID;
765     }
766 
767     /// Marks an instruction as scheduled and puts all dependent ready
768     /// instructions into the ready-list.
769     template <typename ReadyListType>
770     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
771       SD->IsScheduled = true;
772       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
773 
774       ScheduleData *BundleMember = SD;
775       while (BundleMember) {
776         // Handle the def-use chain dependencies.
777         for (Use &U : BundleMember->Inst->operands()) {
778           ScheduleData *OpDef = getScheduleData(U.get());
779           if (OpDef && OpDef->hasValidDependencies() &&
780               OpDef->incrementUnscheduledDeps(-1) == 0) {
781             // There are no more unscheduled dependencies after decrementing,
782             // so we can put the dependent instruction into the ready list.
783             ScheduleData *DepBundle = OpDef->FirstInBundle;
784             assert(!DepBundle->IsScheduled &&
785                    "already scheduled bundle gets ready");
786             ReadyList.insert(DepBundle);
787             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
788           }
789         }
790         // Handle the memory dependencies.
791         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
792           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
793             // There are no more unscheduled dependencies after decrementing,
794             // so we can put the dependent instruction into the ready list.
795             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
796             assert(!DepBundle->IsScheduled &&
797                    "already scheduled bundle gets ready");
798             ReadyList.insert(DepBundle);
799             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
800           }
801         }
802         BundleMember = BundleMember->NextInBundle;
803       }
804     }
805 
806     /// Put all instructions into the ReadyList which are ready for scheduling.
807     template <typename ReadyListType>
808     void initialFillReadyList(ReadyListType &ReadyList) {
809       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
810         ScheduleData *SD = getScheduleData(I);
811         if (SD->isSchedulingEntity() && SD->isReady()) {
812           ReadyList.insert(SD);
813           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
814         }
815       }
816     }
817 
818     /// Checks if a bundle of instructions can be scheduled, i.e. has no
819     /// cyclic dependencies. This is only a dry-run, no instructions are
820     /// actually moved at this stage.
821     bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
822 
823     /// Un-bundles a group of instructions.
824     void cancelScheduling(ArrayRef<Value *> VL);
825 
826     /// Extends the scheduling region so that V is inside the region.
827     /// \returns true if the region size is within the limit.
828     bool extendSchedulingRegion(Value *V);
829 
830     /// Initialize the ScheduleData structures for new instructions in the
831     /// scheduling region.
832     void initScheduleData(Instruction *FromI, Instruction *ToI,
833                           ScheduleData *PrevLoadStore,
834                           ScheduleData *NextLoadStore);
835 
836     /// Updates the dependency information of a bundle and of all instructions/
837     /// bundles which depend on the original bundle.
838     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
839                                BoUpSLP *SLP);
840 
841     /// Sets all instruction in the scheduling region to un-scheduled.
842     void resetSchedule();
843 
844     BasicBlock *BB;
845 
846     /// Simple memory allocation for ScheduleData.
847     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
848 
849     /// The size of a ScheduleData array in ScheduleDataChunks.
850     int ChunkSize;
851 
852     /// The allocator position in the current chunk, which is the last entry
853     /// of ScheduleDataChunks.
854     int ChunkPos;
855 
856     /// Attaches ScheduleData to Instruction.
857     /// Note that the mapping survives during all vectorization iterations, i.e.
858     /// ScheduleData structures are recycled.
859     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
860 
861     struct ReadyList : SmallVector<ScheduleData *, 8> {
862       void insert(ScheduleData *SD) { push_back(SD); }
863     };
864 
865     /// The ready-list for scheduling (only used for the dry-run).
866     ReadyList ReadyInsts;
867 
868     /// The first instruction of the scheduling region.
869     Instruction *ScheduleStart;
870 
871     /// The first instruction _after_ the scheduling region.
872     Instruction *ScheduleEnd;
873 
874     /// The first memory accessing instruction in the scheduling region
875     /// (can be null).
876     ScheduleData *FirstLoadStoreInRegion;
877 
878     /// The last memory accessing instruction in the scheduling region
879     /// (can be null).
880     ScheduleData *LastLoadStoreInRegion;
881 
882     /// The current size of the scheduling region.
883     int ScheduleRegionSize;
884 
885     /// The maximum size allowed for the scheduling region.
886     int ScheduleRegionSizeLimit;
887 
888     /// The ID of the scheduling region. For a new vectorization iteration this
889     /// is incremented which "removes" all ScheduleData from the region.
890     int SchedulingRegionID;
891   };
892 
893   /// Attaches the BlockScheduling structures to basic blocks.
894   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
895 
896   /// Performs the "real" scheduling. Done before vectorization is actually
897   /// performed in a basic block.
898   void scheduleBlock(BlockScheduling *BS);
899 
900   /// List of users to ignore during scheduling and that don't need extracting.
901   ArrayRef<Value *> UserIgnoreList;
902 
903   // Number of load-bundles, which contain consecutive loads.
904   int NumLoadsWantToKeepOrder;
905 
906   // Number of load-bundles of size 2, which are consecutive loads if reversed.
907   int NumLoadsWantToChangeOrder;
908 
909   // Analysis and block reference.
910   Function *F;
911   ScalarEvolution *SE;
912   TargetTransformInfo *TTI;
913   TargetLibraryInfo *TLI;
914   AliasAnalysis *AA;
915   LoopInfo *LI;
916   DominatorTree *DT;
917   /// Instruction builder to construct the vectorized tree.
918   IRBuilder<> Builder;
919 };
920 
921 #ifndef NDEBUG
922 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
923   SD.dump(os);
924   return os;
925 }
926 #endif
927 
928 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
929                         ArrayRef<Value *> UserIgnoreLst) {
930   deleteTree();
931   UserIgnoreList = UserIgnoreLst;
932   if (!getSameType(Roots))
933     return;
934   buildTree_rec(Roots, 0);
935 
936   // Collect the values that we need to extract from the tree.
937   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
938     TreeEntry *Entry = &VectorizableTree[EIdx];
939 
940     // For each lane:
941     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
942       Value *Scalar = Entry->Scalars[Lane];
943 
944       // No need to handle users of gathered values.
945       if (Entry->NeedToGather)
946         continue;
947 
948       for (User *U : Scalar->users()) {
949         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
950 
951         Instruction *UserInst = dyn_cast<Instruction>(U);
952         if (!UserInst)
953           continue;
954 
955         // Skip in-tree scalars that become vectors
956         if (ScalarToTreeEntry.count(U)) {
957           int Idx = ScalarToTreeEntry[U];
958           TreeEntry *UseEntry = &VectorizableTree[Idx];
959           Value *UseScalar = UseEntry->Scalars[0];
960           // Some in-tree scalars will remain as scalar in vectorized
961           // instructions. If that is the case, the one in Lane 0 will
962           // be used.
963           if (UseScalar != U ||
964               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
965             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
966                          << ".\n");
967             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
968             continue;
969           }
970         }
971 
972         // Ignore users in the user ignore list.
973         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
974             UserIgnoreList.end())
975           continue;
976 
977         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
978               Lane << " from " << *Scalar << ".\n");
979         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
980       }
981     }
982   }
983 }
984 
985 
986 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
987   bool SameTy = getSameType(VL); (void)SameTy;
988   bool isAltShuffle = false;
989   assert(SameTy && "Invalid types!");
990 
991   if (Depth == RecursionMaxDepth) {
992     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
993     newTreeEntry(VL, false);
994     return;
995   }
996 
997   // Don't handle vectors.
998   if (VL[0]->getType()->isVectorTy()) {
999     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
1000     newTreeEntry(VL, false);
1001     return;
1002   }
1003 
1004   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1005     if (SI->getValueOperand()->getType()->isVectorTy()) {
1006       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
1007       newTreeEntry(VL, false);
1008       return;
1009     }
1010   unsigned Opcode = getSameOpcode(VL);
1011 
1012   // Check that this shuffle vector refers to the alternate
1013   // sequence of opcodes.
1014   if (Opcode == Instruction::ShuffleVector) {
1015     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
1016     unsigned Op = I0->getOpcode();
1017     if (Op != Instruction::ShuffleVector)
1018       isAltShuffle = true;
1019   }
1020 
1021   // If all of the operands are identical or constant we have a simple solution.
1022   if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
1023     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
1024     newTreeEntry(VL, false);
1025     return;
1026   }
1027 
1028   // We now know that this is a vector of instructions of the same type from
1029   // the same block.
1030 
1031   // Don't vectorize ephemeral values.
1032   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1033     if (EphValues.count(VL[i])) {
1034       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1035             ") is ephemeral.\n");
1036       newTreeEntry(VL, false);
1037       return;
1038     }
1039   }
1040 
1041   // Check if this is a duplicate of another entry.
1042   if (ScalarToTreeEntry.count(VL[0])) {
1043     int Idx = ScalarToTreeEntry[VL[0]];
1044     TreeEntry *E = &VectorizableTree[Idx];
1045     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1046       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1047       if (E->Scalars[i] != VL[i]) {
1048         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1049         newTreeEntry(VL, false);
1050         return;
1051       }
1052     }
1053     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1054     return;
1055   }
1056 
1057   // Check that none of the instructions in the bundle are already in the tree.
1058   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1059     if (ScalarToTreeEntry.count(VL[i])) {
1060       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1061             ") is already in tree.\n");
1062       newTreeEntry(VL, false);
1063       return;
1064     }
1065   }
1066 
1067   // If any of the scalars is marked as a value that needs to stay scalar then
1068   // we need to gather the scalars.
1069   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1070     if (MustGather.count(VL[i])) {
1071       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
1072       newTreeEntry(VL, false);
1073       return;
1074     }
1075   }
1076 
1077   // Check that all of the users of the scalars that we want to vectorize are
1078   // schedulable.
1079   Instruction *VL0 = cast<Instruction>(VL[0]);
1080   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1081 
1082   if (!DT->isReachableFromEntry(BB)) {
1083     // Don't go into unreachable blocks. They may contain instructions with
1084     // dependency cycles which confuse the final scheduling.
1085     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1086     newTreeEntry(VL, false);
1087     return;
1088   }
1089 
1090   // Check that every instructions appears once in this bundle.
1091   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1092     for (unsigned j = i+1; j < e; ++j)
1093       if (VL[i] == VL[j]) {
1094         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1095         newTreeEntry(VL, false);
1096         return;
1097       }
1098 
1099   auto &BSRef = BlocksSchedules[BB];
1100   if (!BSRef) {
1101     BSRef = llvm::make_unique<BlockScheduling>(BB);
1102   }
1103   BlockScheduling &BS = *BSRef.get();
1104 
1105   if (!BS.tryScheduleBundle(VL, this)) {
1106     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1107     assert((!BS.getScheduleData(VL[0]) ||
1108             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
1109            "tryScheduleBundle should cancelScheduling on failure");
1110     newTreeEntry(VL, false);
1111     return;
1112   }
1113   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1114 
1115   switch (Opcode) {
1116     case Instruction::PHI: {
1117       PHINode *PH = dyn_cast<PHINode>(VL0);
1118 
1119       // Check for terminator values (e.g. invoke).
1120       for (unsigned j = 0; j < VL.size(); ++j)
1121         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1122           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1123               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1124           if (Term) {
1125             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1126             BS.cancelScheduling(VL);
1127             newTreeEntry(VL, false);
1128             return;
1129           }
1130         }
1131 
1132       newTreeEntry(VL, true);
1133       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1134 
1135       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1136         ValueList Operands;
1137         // Prepare the operand vector.
1138         for (unsigned j = 0; j < VL.size(); ++j)
1139           Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1140               PH->getIncomingBlock(i)));
1141 
1142         buildTree_rec(Operands, Depth + 1);
1143       }
1144       return;
1145     }
1146     case Instruction::ExtractElement: {
1147       bool Reuse = CanReuseExtract(VL);
1148       if (Reuse) {
1149         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1150       } else {
1151         BS.cancelScheduling(VL);
1152       }
1153       newTreeEntry(VL, Reuse);
1154       return;
1155     }
1156     case Instruction::Load: {
1157       // Check that a vectorized load would load the same memory as a scalar
1158       // load.
1159       // For example we don't want vectorize loads that are smaller than 8 bit.
1160       // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
1161       // loading/storing it as an i8 struct. If we vectorize loads/stores from
1162       // such a struct we read/write packed bits disagreeing with the
1163       // unvectorized version.
1164       const DataLayout &DL = F->getParent()->getDataLayout();
1165       Type *ScalarTy = VL[0]->getType();
1166 
1167       if (DL.getTypeSizeInBits(ScalarTy) !=
1168           DL.getTypeAllocSizeInBits(ScalarTy)) {
1169         BS.cancelScheduling(VL);
1170         newTreeEntry(VL, false);
1171         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
1172         return;
1173       }
1174       // Check if the loads are consecutive or of we need to swizzle them.
1175       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1176         LoadInst *L = cast<LoadInst>(VL[i]);
1177         if (!L->isSimple()) {
1178           BS.cancelScheduling(VL);
1179           newTreeEntry(VL, false);
1180           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1181           return;
1182         }
1183 
1184         if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) {
1185           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL, *SE)) {
1186             ++NumLoadsWantToChangeOrder;
1187           }
1188           BS.cancelScheduling(VL);
1189           newTreeEntry(VL, false);
1190           DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1191           return;
1192         }
1193       }
1194       ++NumLoadsWantToKeepOrder;
1195       newTreeEntry(VL, true);
1196       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1197       return;
1198     }
1199     case Instruction::ZExt:
1200     case Instruction::SExt:
1201     case Instruction::FPToUI:
1202     case Instruction::FPToSI:
1203     case Instruction::FPExt:
1204     case Instruction::PtrToInt:
1205     case Instruction::IntToPtr:
1206     case Instruction::SIToFP:
1207     case Instruction::UIToFP:
1208     case Instruction::Trunc:
1209     case Instruction::FPTrunc:
1210     case Instruction::BitCast: {
1211       Type *SrcTy = VL0->getOperand(0)->getType();
1212       for (unsigned i = 0; i < VL.size(); ++i) {
1213         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1214         if (Ty != SrcTy || !isValidElementType(Ty)) {
1215           BS.cancelScheduling(VL);
1216           newTreeEntry(VL, false);
1217           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1218           return;
1219         }
1220       }
1221       newTreeEntry(VL, true);
1222       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1223 
1224       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1225         ValueList Operands;
1226         // Prepare the operand vector.
1227         for (unsigned j = 0; j < VL.size(); ++j)
1228           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1229 
1230         buildTree_rec(Operands, Depth+1);
1231       }
1232       return;
1233     }
1234     case Instruction::ICmp:
1235     case Instruction::FCmp: {
1236       // Check that all of the compares have the same predicate.
1237       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
1238       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1239       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1240         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1241         if (Cmp->getPredicate() != P0 ||
1242             Cmp->getOperand(0)->getType() != ComparedTy) {
1243           BS.cancelScheduling(VL);
1244           newTreeEntry(VL, false);
1245           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1246           return;
1247         }
1248       }
1249 
1250       newTreeEntry(VL, true);
1251       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1252 
1253       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1254         ValueList Operands;
1255         // Prepare the operand vector.
1256         for (unsigned j = 0; j < VL.size(); ++j)
1257           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1258 
1259         buildTree_rec(Operands, Depth+1);
1260       }
1261       return;
1262     }
1263     case Instruction::Select:
1264     case Instruction::Add:
1265     case Instruction::FAdd:
1266     case Instruction::Sub:
1267     case Instruction::FSub:
1268     case Instruction::Mul:
1269     case Instruction::FMul:
1270     case Instruction::UDiv:
1271     case Instruction::SDiv:
1272     case Instruction::FDiv:
1273     case Instruction::URem:
1274     case Instruction::SRem:
1275     case Instruction::FRem:
1276     case Instruction::Shl:
1277     case Instruction::LShr:
1278     case Instruction::AShr:
1279     case Instruction::And:
1280     case Instruction::Or:
1281     case Instruction::Xor: {
1282       newTreeEntry(VL, true);
1283       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1284 
1285       // Sort operands of the instructions so that each side is more likely to
1286       // have the same opcode.
1287       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1288         ValueList Left, Right;
1289         reorderInputsAccordingToOpcode(VL, Left, Right);
1290         buildTree_rec(Left, Depth + 1);
1291         buildTree_rec(Right, Depth + 1);
1292         return;
1293       }
1294 
1295       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1296         ValueList Operands;
1297         // Prepare the operand vector.
1298         for (unsigned j = 0; j < VL.size(); ++j)
1299           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1300 
1301         buildTree_rec(Operands, Depth+1);
1302       }
1303       return;
1304     }
1305     case Instruction::GetElementPtr: {
1306       // We don't combine GEPs with complicated (nested) indexing.
1307       for (unsigned j = 0; j < VL.size(); ++j) {
1308         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1309           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1310           BS.cancelScheduling(VL);
1311           newTreeEntry(VL, false);
1312           return;
1313         }
1314       }
1315 
1316       // We can't combine several GEPs into one vector if they operate on
1317       // different types.
1318       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1319       for (unsigned j = 0; j < VL.size(); ++j) {
1320         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1321         if (Ty0 != CurTy) {
1322           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1323           BS.cancelScheduling(VL);
1324           newTreeEntry(VL, false);
1325           return;
1326         }
1327       }
1328 
1329       // We don't combine GEPs with non-constant indexes.
1330       for (unsigned j = 0; j < VL.size(); ++j) {
1331         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1332         if (!isa<ConstantInt>(Op)) {
1333           DEBUG(
1334               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1335           BS.cancelScheduling(VL);
1336           newTreeEntry(VL, false);
1337           return;
1338         }
1339       }
1340 
1341       newTreeEntry(VL, true);
1342       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1343       for (unsigned i = 0, e = 2; i < e; ++i) {
1344         ValueList Operands;
1345         // Prepare the operand vector.
1346         for (unsigned j = 0; j < VL.size(); ++j)
1347           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1348 
1349         buildTree_rec(Operands, Depth + 1);
1350       }
1351       return;
1352     }
1353     case Instruction::Store: {
1354       const DataLayout &DL = F->getParent()->getDataLayout();
1355       // Check if the stores are consecutive or of we need to swizzle them.
1356       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1357         if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) {
1358           BS.cancelScheduling(VL);
1359           newTreeEntry(VL, false);
1360           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1361           return;
1362         }
1363 
1364       newTreeEntry(VL, true);
1365       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1366 
1367       ValueList Operands;
1368       for (unsigned j = 0; j < VL.size(); ++j)
1369         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1370 
1371       buildTree_rec(Operands, Depth + 1);
1372       return;
1373     }
1374     case Instruction::Call: {
1375       // Check if the calls are all to the same vectorizable intrinsic.
1376       CallInst *CI = cast<CallInst>(VL[0]);
1377       // Check if this is an Intrinsic call or something that can be
1378       // represented by an intrinsic call
1379       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1380       if (!isTriviallyVectorizable(ID)) {
1381         BS.cancelScheduling(VL);
1382         newTreeEntry(VL, false);
1383         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1384         return;
1385       }
1386       Function *Int = CI->getCalledFunction();
1387       Value *A1I = nullptr;
1388       if (hasVectorInstrinsicScalarOpd(ID, 1))
1389         A1I = CI->getArgOperand(1);
1390       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1391         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1392         if (!CI2 || CI2->getCalledFunction() != Int ||
1393             getIntrinsicIDForCall(CI2, TLI) != ID) {
1394           BS.cancelScheduling(VL);
1395           newTreeEntry(VL, false);
1396           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1397                        << "\n");
1398           return;
1399         }
1400         // ctlz,cttz and powi are special intrinsics whose second argument
1401         // should be same in order for them to be vectorized.
1402         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1403           Value *A1J = CI2->getArgOperand(1);
1404           if (A1I != A1J) {
1405             BS.cancelScheduling(VL);
1406             newTreeEntry(VL, false);
1407             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1408                          << " argument "<< A1I<<"!=" << A1J
1409                          << "\n");
1410             return;
1411           }
1412         }
1413       }
1414 
1415       newTreeEntry(VL, true);
1416       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1417         ValueList Operands;
1418         // Prepare the operand vector.
1419         for (unsigned j = 0; j < VL.size(); ++j) {
1420           CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1421           Operands.push_back(CI2->getArgOperand(i));
1422         }
1423         buildTree_rec(Operands, Depth + 1);
1424       }
1425       return;
1426     }
1427     case Instruction::ShuffleVector: {
1428       // If this is not an alternate sequence of opcode like add-sub
1429       // then do not vectorize this instruction.
1430       if (!isAltShuffle) {
1431         BS.cancelScheduling(VL);
1432         newTreeEntry(VL, false);
1433         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1434         return;
1435       }
1436       newTreeEntry(VL, true);
1437       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1438 
1439       // Reorder operands if reordering would enable vectorization.
1440       if (isa<BinaryOperator>(VL0)) {
1441         ValueList Left, Right;
1442         reorderAltShuffleOperands(VL, Left, Right);
1443         buildTree_rec(Left, Depth + 1);
1444         buildTree_rec(Right, Depth + 1);
1445         return;
1446       }
1447 
1448       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1449         ValueList Operands;
1450         // Prepare the operand vector.
1451         for (unsigned j = 0; j < VL.size(); ++j)
1452           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1453 
1454         buildTree_rec(Operands, Depth + 1);
1455       }
1456       return;
1457     }
1458     default:
1459       BS.cancelScheduling(VL);
1460       newTreeEntry(VL, false);
1461       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1462       return;
1463   }
1464 }
1465 
1466 int BoUpSLP::getEntryCost(TreeEntry *E) {
1467   ArrayRef<Value*> VL = E->Scalars;
1468 
1469   Type *ScalarTy = VL[0]->getType();
1470   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1471     ScalarTy = SI->getValueOperand()->getType();
1472   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1473 
1474   if (E->NeedToGather) {
1475     if (allConstant(VL))
1476       return 0;
1477     if (isSplat(VL)) {
1478       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1479     }
1480     return getGatherCost(E->Scalars);
1481   }
1482   unsigned Opcode = getSameOpcode(VL);
1483   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1484   Instruction *VL0 = cast<Instruction>(VL[0]);
1485   switch (Opcode) {
1486     case Instruction::PHI: {
1487       return 0;
1488     }
1489     case Instruction::ExtractElement: {
1490       if (CanReuseExtract(VL)) {
1491         int DeadCost = 0;
1492         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1493           ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1494           if (E->hasOneUse())
1495             // Take credit for instruction that will become dead.
1496             DeadCost +=
1497                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1498         }
1499         return -DeadCost;
1500       }
1501       return getGatherCost(VecTy);
1502     }
1503     case Instruction::ZExt:
1504     case Instruction::SExt:
1505     case Instruction::FPToUI:
1506     case Instruction::FPToSI:
1507     case Instruction::FPExt:
1508     case Instruction::PtrToInt:
1509     case Instruction::IntToPtr:
1510     case Instruction::SIToFP:
1511     case Instruction::UIToFP:
1512     case Instruction::Trunc:
1513     case Instruction::FPTrunc:
1514     case Instruction::BitCast: {
1515       Type *SrcTy = VL0->getOperand(0)->getType();
1516 
1517       // Calculate the cost of this instruction.
1518       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1519                                                          VL0->getType(), SrcTy);
1520 
1521       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1522       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1523       return VecCost - ScalarCost;
1524     }
1525     case Instruction::FCmp:
1526     case Instruction::ICmp:
1527     case Instruction::Select:
1528     case Instruction::Add:
1529     case Instruction::FAdd:
1530     case Instruction::Sub:
1531     case Instruction::FSub:
1532     case Instruction::Mul:
1533     case Instruction::FMul:
1534     case Instruction::UDiv:
1535     case Instruction::SDiv:
1536     case Instruction::FDiv:
1537     case Instruction::URem:
1538     case Instruction::SRem:
1539     case Instruction::FRem:
1540     case Instruction::Shl:
1541     case Instruction::LShr:
1542     case Instruction::AShr:
1543     case Instruction::And:
1544     case Instruction::Or:
1545     case Instruction::Xor: {
1546       // Calculate the cost of this instruction.
1547       int ScalarCost = 0;
1548       int VecCost = 0;
1549       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1550           Opcode == Instruction::Select) {
1551         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1552         ScalarCost = VecTy->getNumElements() *
1553         TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1554         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1555       } else {
1556         // Certain instructions can be cheaper to vectorize if they have a
1557         // constant second vector operand.
1558         TargetTransformInfo::OperandValueKind Op1VK =
1559             TargetTransformInfo::OK_AnyValue;
1560         TargetTransformInfo::OperandValueKind Op2VK =
1561             TargetTransformInfo::OK_UniformConstantValue;
1562         TargetTransformInfo::OperandValueProperties Op1VP =
1563             TargetTransformInfo::OP_None;
1564         TargetTransformInfo::OperandValueProperties Op2VP =
1565             TargetTransformInfo::OP_None;
1566 
1567         // If all operands are exactly the same ConstantInt then set the
1568         // operand kind to OK_UniformConstantValue.
1569         // If instead not all operands are constants, then set the operand kind
1570         // to OK_AnyValue. If all operands are constants but not the same,
1571         // then set the operand kind to OK_NonUniformConstantValue.
1572         ConstantInt *CInt = nullptr;
1573         for (unsigned i = 0; i < VL.size(); ++i) {
1574           const Instruction *I = cast<Instruction>(VL[i]);
1575           if (!isa<ConstantInt>(I->getOperand(1))) {
1576             Op2VK = TargetTransformInfo::OK_AnyValue;
1577             break;
1578           }
1579           if (i == 0) {
1580             CInt = cast<ConstantInt>(I->getOperand(1));
1581             continue;
1582           }
1583           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1584               CInt != cast<ConstantInt>(I->getOperand(1)))
1585             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1586         }
1587         // FIXME: Currently cost of model modification for division by
1588         // power of 2 is handled only for X86. Add support for other targets.
1589         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1590             CInt->getValue().isPowerOf2())
1591           Op2VP = TargetTransformInfo::OP_PowerOf2;
1592 
1593         ScalarCost = VecTy->getNumElements() *
1594                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1595                                                  Op1VP, Op2VP);
1596         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1597                                               Op1VP, Op2VP);
1598       }
1599       return VecCost - ScalarCost;
1600     }
1601     case Instruction::GetElementPtr: {
1602       TargetTransformInfo::OperandValueKind Op1VK =
1603           TargetTransformInfo::OK_AnyValue;
1604       TargetTransformInfo::OperandValueKind Op2VK =
1605           TargetTransformInfo::OK_UniformConstantValue;
1606 
1607       int ScalarCost =
1608           VecTy->getNumElements() *
1609           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1610       int VecCost =
1611           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1612 
1613       return VecCost - ScalarCost;
1614     }
1615     case Instruction::Load: {
1616       // Cost of wide load - cost of scalar loads.
1617       int ScalarLdCost = VecTy->getNumElements() *
1618       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1619       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1620       return VecLdCost - ScalarLdCost;
1621     }
1622     case Instruction::Store: {
1623       // We know that we can merge the stores. Calculate the cost.
1624       int ScalarStCost = VecTy->getNumElements() *
1625       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1626       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1627       return VecStCost - ScalarStCost;
1628     }
1629     case Instruction::Call: {
1630       CallInst *CI = cast<CallInst>(VL0);
1631       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1632 
1633       // Calculate the cost of the scalar and vector calls.
1634       SmallVector<Type*, 4> ScalarTys, VecTys;
1635       for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1636         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1637         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1638                                          VecTy->getNumElements()));
1639       }
1640 
1641       int ScalarCallCost = VecTy->getNumElements() *
1642           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1643 
1644       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1645 
1646       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1647             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1648             << " for " << *CI << "\n");
1649 
1650       return VecCallCost - ScalarCallCost;
1651     }
1652     case Instruction::ShuffleVector: {
1653       TargetTransformInfo::OperandValueKind Op1VK =
1654           TargetTransformInfo::OK_AnyValue;
1655       TargetTransformInfo::OperandValueKind Op2VK =
1656           TargetTransformInfo::OK_AnyValue;
1657       int ScalarCost = 0;
1658       int VecCost = 0;
1659       for (unsigned i = 0; i < VL.size(); ++i) {
1660         Instruction *I = cast<Instruction>(VL[i]);
1661         if (!I)
1662           break;
1663         ScalarCost +=
1664             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1665       }
1666       // VecCost is equal to sum of the cost of creating 2 vectors
1667       // and the cost of creating shuffle.
1668       Instruction *I0 = cast<Instruction>(VL[0]);
1669       VecCost =
1670           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1671       Instruction *I1 = cast<Instruction>(VL[1]);
1672       VecCost +=
1673           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1674       VecCost +=
1675           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1676       return VecCost - ScalarCost;
1677     }
1678     default:
1679       llvm_unreachable("Unknown instruction");
1680   }
1681 }
1682 
1683 bool BoUpSLP::isFullyVectorizableTinyTree() {
1684   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1685         VectorizableTree.size() << " is fully vectorizable .\n");
1686 
1687   // We only handle trees of height 2.
1688   if (VectorizableTree.size() != 2)
1689     return false;
1690 
1691   // Handle splat and all-constants stores.
1692   if (!VectorizableTree[0].NeedToGather &&
1693       (allConstant(VectorizableTree[1].Scalars) ||
1694        isSplat(VectorizableTree[1].Scalars)))
1695     return true;
1696 
1697   // Gathering cost would be too much for tiny trees.
1698   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1699     return false;
1700 
1701   return true;
1702 }
1703 
1704 int BoUpSLP::getSpillCost() {
1705   // Walk from the bottom of the tree to the top, tracking which values are
1706   // live. When we see a call instruction that is not part of our tree,
1707   // query TTI to see if there is a cost to keeping values live over it
1708   // (for example, if spills and fills are required).
1709   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1710   int Cost = 0;
1711 
1712   SmallPtrSet<Instruction*, 4> LiveValues;
1713   Instruction *PrevInst = nullptr;
1714 
1715   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1716     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1717     if (!Inst)
1718       continue;
1719 
1720     if (!PrevInst) {
1721       PrevInst = Inst;
1722       continue;
1723     }
1724 
1725     DEBUG(
1726       dbgs() << "SLP: #LV: " << LiveValues.size();
1727       for (auto *X : LiveValues)
1728         dbgs() << " " << X->getName();
1729       dbgs() << ", Looking at ";
1730       Inst->dump();
1731       );
1732 
1733     // Update LiveValues.
1734     LiveValues.erase(PrevInst);
1735     for (auto &J : PrevInst->operands()) {
1736       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1737         LiveValues.insert(cast<Instruction>(&*J));
1738     }
1739 
1740     // Now find the sequence of instructions between PrevInst and Inst.
1741     BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
1742         PrevInstIt(PrevInst->getIterator());
1743     --PrevInstIt;
1744     while (InstIt != PrevInstIt) {
1745       if (PrevInstIt == PrevInst->getParent()->rend()) {
1746         PrevInstIt = Inst->getParent()->rbegin();
1747         continue;
1748       }
1749 
1750       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1751         SmallVector<Type*, 4> V;
1752         for (auto *II : LiveValues)
1753           V.push_back(VectorType::get(II->getType(), BundleWidth));
1754         Cost += TTI->getCostOfKeepingLiveOverCall(V);
1755       }
1756 
1757       ++PrevInstIt;
1758     }
1759 
1760     PrevInst = Inst;
1761   }
1762 
1763   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1764   return Cost;
1765 }
1766 
1767 int BoUpSLP::getTreeCost() {
1768   int Cost = 0;
1769   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1770         VectorizableTree.size() << ".\n");
1771 
1772   // We only vectorize tiny trees if it is fully vectorizable.
1773   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1774     if (VectorizableTree.empty()) {
1775       assert(!ExternalUses.size() && "We should not have any external users");
1776     }
1777     return INT_MAX;
1778   }
1779 
1780   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1781 
1782   for (TreeEntry &TE : VectorizableTree) {
1783     int C = getEntryCost(&TE);
1784     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1785           << TE.Scalars[0] << " .\n");
1786     Cost += C;
1787   }
1788 
1789   SmallSet<Value *, 16> ExtractCostCalculated;
1790   int ExtractCost = 0;
1791   for (ExternalUser &EU : ExternalUses) {
1792     // We only add extract cost once for the same scalar.
1793     if (!ExtractCostCalculated.insert(EU.Scalar).second)
1794       continue;
1795 
1796     // Uses by ephemeral values are free (because the ephemeral value will be
1797     // removed prior to code generation, and so the extraction will be
1798     // removed as well).
1799     if (EphValues.count(EU.User))
1800       continue;
1801 
1802     VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
1803     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1804                                            EU.Lane);
1805   }
1806 
1807   Cost += getSpillCost();
1808 
1809   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1810   return  Cost + ExtractCost;
1811 }
1812 
1813 int BoUpSLP::getGatherCost(Type *Ty) {
1814   int Cost = 0;
1815   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1816     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1817   return Cost;
1818 }
1819 
1820 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1821   // Find the type of the operands in VL.
1822   Type *ScalarTy = VL[0]->getType();
1823   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1824     ScalarTy = SI->getValueOperand()->getType();
1825   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1826   // Find the cost of inserting/extracting values from the vector.
1827   return getGatherCost(VecTy);
1828 }
1829 
1830 // Reorder commutative operations in alternate shuffle if the resulting vectors
1831 // are consecutive loads. This would allow us to vectorize the tree.
1832 // If we have something like-
1833 // load a[0] - load b[0]
1834 // load b[1] + load a[1]
1835 // load a[2] - load b[2]
1836 // load a[3] + load b[3]
1837 // Reordering the second load b[1]  load a[1] would allow us to vectorize this
1838 // code.
1839 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
1840                                         SmallVectorImpl<Value *> &Left,
1841                                         SmallVectorImpl<Value *> &Right) {
1842   const DataLayout &DL = F->getParent()->getDataLayout();
1843 
1844   // Push left and right operands of binary operation into Left and Right
1845   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1846     Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
1847     Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
1848   }
1849 
1850   // Reorder if we have a commutative operation and consecutive access
1851   // are on either side of the alternate instructions.
1852   for (unsigned j = 0; j < VL.size() - 1; ++j) {
1853     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
1854       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
1855         Instruction *VL1 = cast<Instruction>(VL[j]);
1856         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1857         if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) {
1858           std::swap(Left[j], Right[j]);
1859           continue;
1860         } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) {
1861           std::swap(Left[j + 1], Right[j + 1]);
1862           continue;
1863         }
1864         // else unchanged
1865       }
1866     }
1867     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
1868       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
1869         Instruction *VL1 = cast<Instruction>(VL[j]);
1870         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
1871         if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) {
1872           std::swap(Left[j], Right[j]);
1873           continue;
1874         } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) {
1875           std::swap(Left[j + 1], Right[j + 1]);
1876           continue;
1877         }
1878         // else unchanged
1879       }
1880     }
1881   }
1882 }
1883 
1884 // Return true if I should be commuted before adding it's left and right
1885 // operands to the arrays Left and Right.
1886 //
1887 // The vectorizer is trying to either have all elements one side being
1888 // instruction with the same opcode to enable further vectorization, or having
1889 // a splat to lower the vectorizing cost.
1890 static bool shouldReorderOperands(int i, Instruction &I,
1891                                   SmallVectorImpl<Value *> &Left,
1892                                   SmallVectorImpl<Value *> &Right,
1893                                   bool AllSameOpcodeLeft,
1894                                   bool AllSameOpcodeRight, bool SplatLeft,
1895                                   bool SplatRight) {
1896   Value *VLeft = I.getOperand(0);
1897   Value *VRight = I.getOperand(1);
1898   // If we have "SplatRight", try to see if commuting is needed to preserve it.
1899   if (SplatRight) {
1900     if (VRight == Right[i - 1])
1901       // Preserve SplatRight
1902       return false;
1903     if (VLeft == Right[i - 1]) {
1904       // Commuting would preserve SplatRight, but we don't want to break
1905       // SplatLeft either, i.e. preserve the original order if possible.
1906       // (FIXME: why do we care?)
1907       if (SplatLeft && VLeft == Left[i - 1])
1908         return false;
1909       return true;
1910     }
1911   }
1912   // Symmetrically handle Right side.
1913   if (SplatLeft) {
1914     if (VLeft == Left[i - 1])
1915       // Preserve SplatLeft
1916       return false;
1917     if (VRight == Left[i - 1])
1918       return true;
1919   }
1920 
1921   Instruction *ILeft = dyn_cast<Instruction>(VLeft);
1922   Instruction *IRight = dyn_cast<Instruction>(VRight);
1923 
1924   // If we have "AllSameOpcodeRight", try to see if the left operands preserves
1925   // it and not the right, in this case we want to commute.
1926   if (AllSameOpcodeRight) {
1927     unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
1928     if (IRight && RightPrevOpcode == IRight->getOpcode())
1929       // Do not commute, a match on the right preserves AllSameOpcodeRight
1930       return false;
1931     if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
1932       // We have a match and may want to commute, but first check if there is
1933       // not also a match on the existing operands on the Left to preserve
1934       // AllSameOpcodeLeft, i.e. preserve the original order if possible.
1935       // (FIXME: why do we care?)
1936       if (AllSameOpcodeLeft && ILeft &&
1937           cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
1938         return false;
1939       return true;
1940     }
1941   }
1942   // Symmetrically handle Left side.
1943   if (AllSameOpcodeLeft) {
1944     unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
1945     if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
1946       return false;
1947     if (IRight && LeftPrevOpcode == IRight->getOpcode())
1948       return true;
1949   }
1950   return false;
1951 }
1952 
1953 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
1954                                              SmallVectorImpl<Value *> &Left,
1955                                              SmallVectorImpl<Value *> &Right) {
1956 
1957   if (VL.size()) {
1958     // Peel the first iteration out of the loop since there's nothing
1959     // interesting to do anyway and it simplifies the checks in the loop.
1960     auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
1961     auto VRight = cast<Instruction>(VL[0])->getOperand(1);
1962     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
1963       // Favor having instruction to the right. FIXME: why?
1964       std::swap(VLeft, VRight);
1965     Left.push_back(VLeft);
1966     Right.push_back(VRight);
1967   }
1968 
1969   // Keep track if we have instructions with all the same opcode on one side.
1970   bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
1971   bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
1972   // Keep track if we have one side with all the same value (broadcast).
1973   bool SplatLeft = true;
1974   bool SplatRight = true;
1975 
1976   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1977     Instruction *I = cast<Instruction>(VL[i]);
1978     assert(I->isCommutative() && "Can only process commutative instruction");
1979     // Commute to favor either a splat or maximizing having the same opcodes on
1980     // one side.
1981     if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
1982                               AllSameOpcodeRight, SplatLeft, SplatRight)) {
1983       Left.push_back(I->getOperand(1));
1984       Right.push_back(I->getOperand(0));
1985     } else {
1986       Left.push_back(I->getOperand(0));
1987       Right.push_back(I->getOperand(1));
1988     }
1989     // Update Splat* and AllSameOpcode* after the insertion.
1990     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
1991     SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
1992     AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
1993                         (cast<Instruction>(Left[i - 1])->getOpcode() ==
1994                          cast<Instruction>(Left[i])->getOpcode());
1995     AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
1996                          (cast<Instruction>(Right[i - 1])->getOpcode() ==
1997                           cast<Instruction>(Right[i])->getOpcode());
1998   }
1999 
2000   // If one operand end up being broadcast, return this operand order.
2001   if (SplatRight || SplatLeft)
2002     return;
2003 
2004   const DataLayout &DL = F->getParent()->getDataLayout();
2005 
2006   // Finally check if we can get longer vectorizable chain by reordering
2007   // without breaking the good operand order detected above.
2008   // E.g. If we have something like-
2009   // load a[0]  load b[0]
2010   // load b[1]  load a[1]
2011   // load a[2]  load b[2]
2012   // load a[3]  load b[3]
2013   // Reordering the second load b[1]  load a[1] would allow us to vectorize
2014   // this code and we still retain AllSameOpcode property.
2015   // FIXME: This load reordering might break AllSameOpcode in some rare cases
2016   // such as-
2017   // add a[0],c[0]  load b[0]
2018   // add a[1],c[2]  load b[1]
2019   // b[2]           load b[2]
2020   // add a[3],c[3]  load b[3]
2021   for (unsigned j = 0; j < VL.size() - 1; ++j) {
2022     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
2023       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
2024         if (isConsecutiveAccess(L, L1, DL, *SE)) {
2025           std::swap(Left[j + 1], Right[j + 1]);
2026           continue;
2027         }
2028       }
2029     }
2030     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
2031       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
2032         if (isConsecutiveAccess(L, L1, DL, *SE)) {
2033           std::swap(Left[j + 1], Right[j + 1]);
2034           continue;
2035         }
2036       }
2037     }
2038     // else unchanged
2039   }
2040 }
2041 
2042 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
2043   Instruction *VL0 = cast<Instruction>(VL[0]);
2044   BasicBlock::iterator NextInst(VL0);
2045   ++NextInst;
2046   Builder.SetInsertPoint(VL0->getParent(), NextInst);
2047   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
2048 }
2049 
2050 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
2051   Value *Vec = UndefValue::get(Ty);
2052   // Generate the 'InsertElement' instruction.
2053   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
2054     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
2055     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
2056       GatherSeq.insert(Insrt);
2057       CSEBlocks.insert(Insrt->getParent());
2058 
2059       // Add to our 'need-to-extract' list.
2060       if (ScalarToTreeEntry.count(VL[i])) {
2061         int Idx = ScalarToTreeEntry[VL[i]];
2062         TreeEntry *E = &VectorizableTree[Idx];
2063         // Find which lane we need to extract.
2064         int FoundLane = -1;
2065         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2066           // Is this the lane of the scalar that we are looking for ?
2067           if (E->Scalars[Lane] == VL[i]) {
2068             FoundLane = Lane;
2069             break;
2070           }
2071         }
2072         assert(FoundLane >= 0 && "Could not find the correct lane");
2073         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
2074       }
2075     }
2076   }
2077 
2078   return Vec;
2079 }
2080 
2081 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
2082   SmallDenseMap<Value*, int>::const_iterator Entry
2083     = ScalarToTreeEntry.find(VL[0]);
2084   if (Entry != ScalarToTreeEntry.end()) {
2085     int Idx = Entry->second;
2086     const TreeEntry *En = &VectorizableTree[Idx];
2087     if (En->isSame(VL) && En->VectorizedValue)
2088       return En->VectorizedValue;
2089   }
2090   return nullptr;
2091 }
2092 
2093 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
2094   if (ScalarToTreeEntry.count(VL[0])) {
2095     int Idx = ScalarToTreeEntry[VL[0]];
2096     TreeEntry *E = &VectorizableTree[Idx];
2097     if (E->isSame(VL))
2098       return vectorizeTree(E);
2099   }
2100 
2101   Type *ScalarTy = VL[0]->getType();
2102   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
2103     ScalarTy = SI->getValueOperand()->getType();
2104   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
2105 
2106   return Gather(VL, VecTy);
2107 }
2108 
2109 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
2110   IRBuilder<>::InsertPointGuard Guard(Builder);
2111 
2112   if (E->VectorizedValue) {
2113     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
2114     return E->VectorizedValue;
2115   }
2116 
2117   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
2118   Type *ScalarTy = VL0->getType();
2119   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
2120     ScalarTy = SI->getValueOperand()->getType();
2121   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
2122 
2123   if (E->NeedToGather) {
2124     setInsertPointAfterBundle(E->Scalars);
2125     return Gather(E->Scalars, VecTy);
2126   }
2127 
2128   const DataLayout &DL = F->getParent()->getDataLayout();
2129   unsigned Opcode = getSameOpcode(E->Scalars);
2130 
2131   switch (Opcode) {
2132     case Instruction::PHI: {
2133       PHINode *PH = dyn_cast<PHINode>(VL0);
2134       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
2135       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2136       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
2137       E->VectorizedValue = NewPhi;
2138 
2139       // PHINodes may have multiple entries from the same block. We want to
2140       // visit every block once.
2141       SmallSet<BasicBlock*, 4> VisitedBBs;
2142 
2143       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
2144         ValueList Operands;
2145         BasicBlock *IBB = PH->getIncomingBlock(i);
2146 
2147         if (!VisitedBBs.insert(IBB).second) {
2148           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
2149           continue;
2150         }
2151 
2152         // Prepare the operand vector.
2153         for (Value *V : E->Scalars)
2154           Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
2155 
2156         Builder.SetInsertPoint(IBB->getTerminator());
2157         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
2158         Value *Vec = vectorizeTree(Operands);
2159         NewPhi->addIncoming(Vec, IBB);
2160       }
2161 
2162       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
2163              "Invalid number of incoming values");
2164       return NewPhi;
2165     }
2166 
2167     case Instruction::ExtractElement: {
2168       if (CanReuseExtract(E->Scalars)) {
2169         Value *V = VL0->getOperand(0);
2170         E->VectorizedValue = V;
2171         return V;
2172       }
2173       return Gather(E->Scalars, VecTy);
2174     }
2175     case Instruction::ZExt:
2176     case Instruction::SExt:
2177     case Instruction::FPToUI:
2178     case Instruction::FPToSI:
2179     case Instruction::FPExt:
2180     case Instruction::PtrToInt:
2181     case Instruction::IntToPtr:
2182     case Instruction::SIToFP:
2183     case Instruction::UIToFP:
2184     case Instruction::Trunc:
2185     case Instruction::FPTrunc:
2186     case Instruction::BitCast: {
2187       ValueList INVL;
2188       for (Value *V : E->Scalars)
2189         INVL.push_back(cast<Instruction>(V)->getOperand(0));
2190 
2191       setInsertPointAfterBundle(E->Scalars);
2192 
2193       Value *InVec = vectorizeTree(INVL);
2194 
2195       if (Value *V = alreadyVectorized(E->Scalars))
2196         return V;
2197 
2198       CastInst *CI = dyn_cast<CastInst>(VL0);
2199       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
2200       E->VectorizedValue = V;
2201       ++NumVectorInstructions;
2202       return V;
2203     }
2204     case Instruction::FCmp:
2205     case Instruction::ICmp: {
2206       ValueList LHSV, RHSV;
2207       for (Value *V : E->Scalars) {
2208         LHSV.push_back(cast<Instruction>(V)->getOperand(0));
2209         RHSV.push_back(cast<Instruction>(V)->getOperand(1));
2210       }
2211 
2212       setInsertPointAfterBundle(E->Scalars);
2213 
2214       Value *L = vectorizeTree(LHSV);
2215       Value *R = vectorizeTree(RHSV);
2216 
2217       if (Value *V = alreadyVectorized(E->Scalars))
2218         return V;
2219 
2220       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
2221       Value *V;
2222       if (Opcode == Instruction::FCmp)
2223         V = Builder.CreateFCmp(P0, L, R);
2224       else
2225         V = Builder.CreateICmp(P0, L, R);
2226 
2227       E->VectorizedValue = V;
2228       ++NumVectorInstructions;
2229       return V;
2230     }
2231     case Instruction::Select: {
2232       ValueList TrueVec, FalseVec, CondVec;
2233       for (Value *V : E->Scalars) {
2234         CondVec.push_back(cast<Instruction>(V)->getOperand(0));
2235         TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
2236         FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
2237       }
2238 
2239       setInsertPointAfterBundle(E->Scalars);
2240 
2241       Value *Cond = vectorizeTree(CondVec);
2242       Value *True = vectorizeTree(TrueVec);
2243       Value *False = vectorizeTree(FalseVec);
2244 
2245       if (Value *V = alreadyVectorized(E->Scalars))
2246         return V;
2247 
2248       Value *V = Builder.CreateSelect(Cond, True, False);
2249       E->VectorizedValue = V;
2250       ++NumVectorInstructions;
2251       return V;
2252     }
2253     case Instruction::Add:
2254     case Instruction::FAdd:
2255     case Instruction::Sub:
2256     case Instruction::FSub:
2257     case Instruction::Mul:
2258     case Instruction::FMul:
2259     case Instruction::UDiv:
2260     case Instruction::SDiv:
2261     case Instruction::FDiv:
2262     case Instruction::URem:
2263     case Instruction::SRem:
2264     case Instruction::FRem:
2265     case Instruction::Shl:
2266     case Instruction::LShr:
2267     case Instruction::AShr:
2268     case Instruction::And:
2269     case Instruction::Or:
2270     case Instruction::Xor: {
2271       ValueList LHSVL, RHSVL;
2272       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2273         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2274       else
2275         for (Value *V : E->Scalars) {
2276           LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
2277           RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
2278         }
2279 
2280       setInsertPointAfterBundle(E->Scalars);
2281 
2282       Value *LHS = vectorizeTree(LHSVL);
2283       Value *RHS = vectorizeTree(RHSVL);
2284 
2285       if (LHS == RHS && isa<Instruction>(LHS)) {
2286         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2287       }
2288 
2289       if (Value *V = alreadyVectorized(E->Scalars))
2290         return V;
2291 
2292       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2293       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2294       E->VectorizedValue = V;
2295       propagateIRFlags(E->VectorizedValue, E->Scalars);
2296       ++NumVectorInstructions;
2297 
2298       if (Instruction *I = dyn_cast<Instruction>(V))
2299         return propagateMetadata(I, E->Scalars);
2300 
2301       return V;
2302     }
2303     case Instruction::Load: {
2304       // Loads are inserted at the head of the tree because we don't want to
2305       // sink them all the way down past store instructions.
2306       setInsertPointAfterBundle(E->Scalars);
2307 
2308       LoadInst *LI = cast<LoadInst>(VL0);
2309       Type *ScalarLoadTy = LI->getType();
2310       unsigned AS = LI->getPointerAddressSpace();
2311 
2312       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2313                                             VecTy->getPointerTo(AS));
2314 
2315       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2316       // ExternalUses list to make sure that an extract will be generated in the
2317       // future.
2318       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2319         ExternalUses.push_back(
2320             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2321 
2322       unsigned Alignment = LI->getAlignment();
2323       LI = Builder.CreateLoad(VecPtr);
2324       if (!Alignment) {
2325         Alignment = DL.getABITypeAlignment(ScalarLoadTy);
2326       }
2327       LI->setAlignment(Alignment);
2328       E->VectorizedValue = LI;
2329       ++NumVectorInstructions;
2330       return propagateMetadata(LI, E->Scalars);
2331     }
2332     case Instruction::Store: {
2333       StoreInst *SI = cast<StoreInst>(VL0);
2334       unsigned Alignment = SI->getAlignment();
2335       unsigned AS = SI->getPointerAddressSpace();
2336 
2337       ValueList ValueOp;
2338       for (Value *V : E->Scalars)
2339         ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
2340 
2341       setInsertPointAfterBundle(E->Scalars);
2342 
2343       Value *VecValue = vectorizeTree(ValueOp);
2344       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2345                                             VecTy->getPointerTo(AS));
2346       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2347 
2348       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2349       // ExternalUses list to make sure that an extract will be generated in the
2350       // future.
2351       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2352         ExternalUses.push_back(
2353             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2354 
2355       if (!Alignment) {
2356         Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
2357       }
2358       S->setAlignment(Alignment);
2359       E->VectorizedValue = S;
2360       ++NumVectorInstructions;
2361       return propagateMetadata(S, E->Scalars);
2362     }
2363     case Instruction::GetElementPtr: {
2364       setInsertPointAfterBundle(E->Scalars);
2365 
2366       ValueList Op0VL;
2367       for (Value *V : E->Scalars)
2368         Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
2369 
2370       Value *Op0 = vectorizeTree(Op0VL);
2371 
2372       std::vector<Value *> OpVecs;
2373       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2374            ++j) {
2375         ValueList OpVL;
2376         for (Value *V : E->Scalars)
2377           OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
2378 
2379         Value *OpVec = vectorizeTree(OpVL);
2380         OpVecs.push_back(OpVec);
2381       }
2382 
2383       Value *V = Builder.CreateGEP(
2384           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
2385       E->VectorizedValue = V;
2386       ++NumVectorInstructions;
2387 
2388       if (Instruction *I = dyn_cast<Instruction>(V))
2389         return propagateMetadata(I, E->Scalars);
2390 
2391       return V;
2392     }
2393     case Instruction::Call: {
2394       CallInst *CI = cast<CallInst>(VL0);
2395       setInsertPointAfterBundle(E->Scalars);
2396       Function *FI;
2397       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2398       Value *ScalarArg = nullptr;
2399       if (CI && (FI = CI->getCalledFunction())) {
2400         IID = FI->getIntrinsicID();
2401       }
2402       std::vector<Value *> OpVecs;
2403       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2404         ValueList OpVL;
2405         // ctlz,cttz and powi are special intrinsics whose second argument is
2406         // a scalar. This argument should not be vectorized.
2407         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2408           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2409           ScalarArg = CEI->getArgOperand(j);
2410           OpVecs.push_back(CEI->getArgOperand(j));
2411           continue;
2412         }
2413         for (Value *V : E->Scalars) {
2414           CallInst *CEI = cast<CallInst>(V);
2415           OpVL.push_back(CEI->getArgOperand(j));
2416         }
2417 
2418         Value *OpVec = vectorizeTree(OpVL);
2419         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2420         OpVecs.push_back(OpVec);
2421       }
2422 
2423       Module *M = F->getParent();
2424       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2425       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2426       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2427       Value *V = Builder.CreateCall(CF, OpVecs);
2428 
2429       // The scalar argument uses an in-tree scalar so we add the new vectorized
2430       // call to ExternalUses list to make sure that an extract will be
2431       // generated in the future.
2432       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2433         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2434 
2435       E->VectorizedValue = V;
2436       ++NumVectorInstructions;
2437       return V;
2438     }
2439     case Instruction::ShuffleVector: {
2440       ValueList LHSVL, RHSVL;
2441       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
2442       reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
2443       setInsertPointAfterBundle(E->Scalars);
2444 
2445       Value *LHS = vectorizeTree(LHSVL);
2446       Value *RHS = vectorizeTree(RHSVL);
2447 
2448       if (Value *V = alreadyVectorized(E->Scalars))
2449         return V;
2450 
2451       // Create a vector of LHS op1 RHS
2452       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2453       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2454 
2455       // Create a vector of LHS op2 RHS
2456       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2457       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2458       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2459 
2460       // Create shuffle to take alternate operations from the vector.
2461       // Also, gather up odd and even scalar ops to propagate IR flags to
2462       // each vector operation.
2463       ValueList OddScalars, EvenScalars;
2464       unsigned e = E->Scalars.size();
2465       SmallVector<Constant *, 8> Mask(e);
2466       for (unsigned i = 0; i < e; ++i) {
2467         if (i & 1) {
2468           Mask[i] = Builder.getInt32(e + i);
2469           OddScalars.push_back(E->Scalars[i]);
2470         } else {
2471           Mask[i] = Builder.getInt32(i);
2472           EvenScalars.push_back(E->Scalars[i]);
2473         }
2474       }
2475 
2476       Value *ShuffleMask = ConstantVector::get(Mask);
2477       propagateIRFlags(V0, EvenScalars);
2478       propagateIRFlags(V1, OddScalars);
2479 
2480       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2481       E->VectorizedValue = V;
2482       ++NumVectorInstructions;
2483       if (Instruction *I = dyn_cast<Instruction>(V))
2484         return propagateMetadata(I, E->Scalars);
2485 
2486       return V;
2487     }
2488     default:
2489     llvm_unreachable("unknown inst");
2490   }
2491   return nullptr;
2492 }
2493 
2494 Value *BoUpSLP::vectorizeTree() {
2495 
2496   // All blocks must be scheduled before any instructions are inserted.
2497   for (auto &BSIter : BlocksSchedules) {
2498     scheduleBlock(BSIter.second.get());
2499   }
2500 
2501   Builder.SetInsertPoint(&F->getEntryBlock().front());
2502   vectorizeTree(&VectorizableTree[0]);
2503 
2504   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2505 
2506   // Extract all of the elements with the external uses.
2507   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2508        it != e; ++it) {
2509     Value *Scalar = it->Scalar;
2510     llvm::User *User = it->User;
2511 
2512     // Skip users that we already RAUW. This happens when one instruction
2513     // has multiple uses of the same value.
2514     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2515         Scalar->user_end())
2516       continue;
2517     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2518 
2519     int Idx = ScalarToTreeEntry[Scalar];
2520     TreeEntry *E = &VectorizableTree[Idx];
2521     assert(!E->NeedToGather && "Extracting from a gather list");
2522 
2523     Value *Vec = E->VectorizedValue;
2524     assert(Vec && "Can't find vectorizable value");
2525 
2526     Value *Lane = Builder.getInt32(it->Lane);
2527     // Generate extracts for out-of-tree users.
2528     // Find the insertion point for the extractelement lane.
2529     if (isa<Instruction>(Vec)){
2530       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2531         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2532           if (PH->getIncomingValue(i) == Scalar) {
2533             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2534             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2535             CSEBlocks.insert(PH->getIncomingBlock(i));
2536             PH->setOperand(i, Ex);
2537           }
2538         }
2539       } else {
2540         Builder.SetInsertPoint(cast<Instruction>(User));
2541         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2542         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2543         User->replaceUsesOfWith(Scalar, Ex);
2544      }
2545     } else {
2546       Builder.SetInsertPoint(&F->getEntryBlock().front());
2547       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2548       CSEBlocks.insert(&F->getEntryBlock());
2549       User->replaceUsesOfWith(Scalar, Ex);
2550     }
2551 
2552     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2553   }
2554 
2555   // For each vectorized value:
2556   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2557     TreeEntry *Entry = &VectorizableTree[EIdx];
2558 
2559     // For each lane:
2560     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2561       Value *Scalar = Entry->Scalars[Lane];
2562       // No need to handle users of gathered values.
2563       if (Entry->NeedToGather)
2564         continue;
2565 
2566       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2567 
2568       Type *Ty = Scalar->getType();
2569       if (!Ty->isVoidTy()) {
2570 #ifndef NDEBUG
2571         for (User *U : Scalar->users()) {
2572           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2573 
2574           assert((ScalarToTreeEntry.count(U) ||
2575                   // It is legal to replace users in the ignorelist by undef.
2576                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2577                    UserIgnoreList.end())) &&
2578                  "Replacing out-of-tree value with undef");
2579         }
2580 #endif
2581         Value *Undef = UndefValue::get(Ty);
2582         Scalar->replaceAllUsesWith(Undef);
2583       }
2584       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2585       eraseInstruction(cast<Instruction>(Scalar));
2586     }
2587   }
2588 
2589   Builder.ClearInsertionPoint();
2590 
2591   return VectorizableTree[0].VectorizedValue;
2592 }
2593 
2594 void BoUpSLP::optimizeGatherSequence() {
2595   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2596         << " gather sequences instructions.\n");
2597   // LICM InsertElementInst sequences.
2598   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2599        e = GatherSeq.end(); it != e; ++it) {
2600     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2601 
2602     if (!Insert)
2603       continue;
2604 
2605     // Check if this block is inside a loop.
2606     Loop *L = LI->getLoopFor(Insert->getParent());
2607     if (!L)
2608       continue;
2609 
2610     // Check if it has a preheader.
2611     BasicBlock *PreHeader = L->getLoopPreheader();
2612     if (!PreHeader)
2613       continue;
2614 
2615     // If the vector or the element that we insert into it are
2616     // instructions that are defined in this basic block then we can't
2617     // hoist this instruction.
2618     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2619     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2620     if (CurrVec && L->contains(CurrVec))
2621       continue;
2622     if (NewElem && L->contains(NewElem))
2623       continue;
2624 
2625     // We can hoist this instruction. Move it to the pre-header.
2626     Insert->moveBefore(PreHeader->getTerminator());
2627   }
2628 
2629   // Make a list of all reachable blocks in our CSE queue.
2630   SmallVector<const DomTreeNode *, 8> CSEWorkList;
2631   CSEWorkList.reserve(CSEBlocks.size());
2632   for (BasicBlock *BB : CSEBlocks)
2633     if (DomTreeNode *N = DT->getNode(BB)) {
2634       assert(DT->isReachableFromEntry(N));
2635       CSEWorkList.push_back(N);
2636     }
2637 
2638   // Sort blocks by domination. This ensures we visit a block after all blocks
2639   // dominating it are visited.
2640   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2641                    [this](const DomTreeNode *A, const DomTreeNode *B) {
2642     return DT->properlyDominates(A, B);
2643   });
2644 
2645   // Perform O(N^2) search over the gather sequences and merge identical
2646   // instructions. TODO: We can further optimize this scan if we split the
2647   // instructions into different buckets based on the insert lane.
2648   SmallVector<Instruction *, 16> Visited;
2649   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2650     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2651            "Worklist not sorted properly!");
2652     BasicBlock *BB = (*I)->getBlock();
2653     // For all instructions in blocks containing gather sequences:
2654     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2655       Instruction *In = &*it++;
2656       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2657         continue;
2658 
2659       // Check if we can replace this instruction with any of the
2660       // visited instructions.
2661       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2662                                                     ve = Visited.end();
2663            v != ve; ++v) {
2664         if (In->isIdenticalTo(*v) &&
2665             DT->dominates((*v)->getParent(), In->getParent())) {
2666           In->replaceAllUsesWith(*v);
2667           eraseInstruction(In);
2668           In = nullptr;
2669           break;
2670         }
2671       }
2672       if (In) {
2673         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2674         Visited.push_back(In);
2675       }
2676     }
2677   }
2678   CSEBlocks.clear();
2679   GatherSeq.clear();
2680 }
2681 
2682 // Groups the instructions to a bundle (which is then a single scheduling entity)
2683 // and schedules instructions until the bundle gets ready.
2684 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2685                                                  BoUpSLP *SLP) {
2686   if (isa<PHINode>(VL[0]))
2687     return true;
2688 
2689   // Initialize the instruction bundle.
2690   Instruction *OldScheduleEnd = ScheduleEnd;
2691   ScheduleData *PrevInBundle = nullptr;
2692   ScheduleData *Bundle = nullptr;
2693   bool ReSchedule = false;
2694   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
2695 
2696   // Make sure that the scheduling region contains all
2697   // instructions of the bundle.
2698   for (Value *V : VL) {
2699     if (!extendSchedulingRegion(V))
2700       return false;
2701   }
2702 
2703   for (Value *V : VL) {
2704     ScheduleData *BundleMember = getScheduleData(V);
2705     assert(BundleMember &&
2706            "no ScheduleData for bundle member (maybe not in same basic block)");
2707     if (BundleMember->IsScheduled) {
2708       // A bundle member was scheduled as single instruction before and now
2709       // needs to be scheduled as part of the bundle. We just get rid of the
2710       // existing schedule.
2711       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
2712                    << " was already scheduled\n");
2713       ReSchedule = true;
2714     }
2715     assert(BundleMember->isSchedulingEntity() &&
2716            "bundle member already part of other bundle");
2717     if (PrevInBundle) {
2718       PrevInBundle->NextInBundle = BundleMember;
2719     } else {
2720       Bundle = BundleMember;
2721     }
2722     BundleMember->UnscheduledDepsInBundle = 0;
2723     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2724 
2725     // Group the instructions to a bundle.
2726     BundleMember->FirstInBundle = Bundle;
2727     PrevInBundle = BundleMember;
2728   }
2729   if (ScheduleEnd != OldScheduleEnd) {
2730     // The scheduling region got new instructions at the lower end (or it is a
2731     // new region for the first bundle). This makes it necessary to
2732     // recalculate all dependencies.
2733     // It is seldom that this needs to be done a second time after adding the
2734     // initial bundle to the region.
2735     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2736       ScheduleData *SD = getScheduleData(I);
2737       SD->clearDependencies();
2738     }
2739     ReSchedule = true;
2740   }
2741   if (ReSchedule) {
2742     resetSchedule();
2743     initialFillReadyList(ReadyInsts);
2744   }
2745 
2746   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2747                << BB->getName() << "\n");
2748 
2749   calculateDependencies(Bundle, true, SLP);
2750 
2751   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2752   // means that there are no cyclic dependencies and we can schedule it.
2753   // Note that's important that we don't "schedule" the bundle yet (see
2754   // cancelScheduling).
2755   while (!Bundle->isReady() && !ReadyInsts.empty()) {
2756 
2757     ScheduleData *pickedSD = ReadyInsts.back();
2758     ReadyInsts.pop_back();
2759 
2760     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2761       schedule(pickedSD, ReadyInsts);
2762     }
2763   }
2764   if (!Bundle->isReady()) {
2765     cancelScheduling(VL);
2766     return false;
2767   }
2768   return true;
2769 }
2770 
2771 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2772   if (isa<PHINode>(VL[0]))
2773     return;
2774 
2775   ScheduleData *Bundle = getScheduleData(VL[0]);
2776   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
2777   assert(!Bundle->IsScheduled &&
2778          "Can't cancel bundle which is already scheduled");
2779   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2780          "tried to unbundle something which is not a bundle");
2781 
2782   // Un-bundle: make single instructions out of the bundle.
2783   ScheduleData *BundleMember = Bundle;
2784   while (BundleMember) {
2785     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2786     BundleMember->FirstInBundle = BundleMember;
2787     ScheduleData *Next = BundleMember->NextInBundle;
2788     BundleMember->NextInBundle = nullptr;
2789     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2790     if (BundleMember->UnscheduledDepsInBundle == 0) {
2791       ReadyInsts.insert(BundleMember);
2792     }
2793     BundleMember = Next;
2794   }
2795 }
2796 
2797 bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2798   if (getScheduleData(V))
2799     return true;
2800   Instruction *I = dyn_cast<Instruction>(V);
2801   assert(I && "bundle member must be an instruction");
2802   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2803   if (!ScheduleStart) {
2804     // It's the first instruction in the new region.
2805     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2806     ScheduleStart = I;
2807     ScheduleEnd = I->getNextNode();
2808     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2809     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
2810     return true;
2811   }
2812   // Search up and down at the same time, because we don't know if the new
2813   // instruction is above or below the existing scheduling region.
2814   BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
2815   BasicBlock::reverse_iterator UpperEnd = BB->rend();
2816   BasicBlock::iterator DownIter(ScheduleEnd);
2817   BasicBlock::iterator LowerEnd = BB->end();
2818   for (;;) {
2819     if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
2820       DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
2821       return false;
2822     }
2823 
2824     if (UpIter != UpperEnd) {
2825       if (&*UpIter == I) {
2826         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2827         ScheduleStart = I;
2828         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
2829         return true;
2830       }
2831       UpIter++;
2832     }
2833     if (DownIter != LowerEnd) {
2834       if (&*DownIter == I) {
2835         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2836                          nullptr);
2837         ScheduleEnd = I->getNextNode();
2838         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2839         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
2840         return true;
2841       }
2842       DownIter++;
2843     }
2844     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2845            "instruction not found in block");
2846   }
2847   return true;
2848 }
2849 
2850 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2851                                                 Instruction *ToI,
2852                                                 ScheduleData *PrevLoadStore,
2853                                                 ScheduleData *NextLoadStore) {
2854   ScheduleData *CurrentLoadStore = PrevLoadStore;
2855   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2856     ScheduleData *SD = ScheduleDataMap[I];
2857     if (!SD) {
2858       // Allocate a new ScheduleData for the instruction.
2859       if (ChunkPos >= ChunkSize) {
2860         ScheduleDataChunks.push_back(
2861             llvm::make_unique<ScheduleData[]>(ChunkSize));
2862         ChunkPos = 0;
2863       }
2864       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2865       ScheduleDataMap[I] = SD;
2866       SD->Inst = I;
2867     }
2868     assert(!isInSchedulingRegion(SD) &&
2869            "new ScheduleData already in scheduling region");
2870     SD->init(SchedulingRegionID);
2871 
2872     if (I->mayReadOrWriteMemory()) {
2873       // Update the linked list of memory accessing instructions.
2874       if (CurrentLoadStore) {
2875         CurrentLoadStore->NextLoadStore = SD;
2876       } else {
2877         FirstLoadStoreInRegion = SD;
2878       }
2879       CurrentLoadStore = SD;
2880     }
2881   }
2882   if (NextLoadStore) {
2883     if (CurrentLoadStore)
2884       CurrentLoadStore->NextLoadStore = NextLoadStore;
2885   } else {
2886     LastLoadStoreInRegion = CurrentLoadStore;
2887   }
2888 }
2889 
2890 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2891                                                      bool InsertInReadyList,
2892                                                      BoUpSLP *SLP) {
2893   assert(SD->isSchedulingEntity());
2894 
2895   SmallVector<ScheduleData *, 10> WorkList;
2896   WorkList.push_back(SD);
2897 
2898   while (!WorkList.empty()) {
2899     ScheduleData *SD = WorkList.back();
2900     WorkList.pop_back();
2901 
2902     ScheduleData *BundleMember = SD;
2903     while (BundleMember) {
2904       assert(isInSchedulingRegion(BundleMember));
2905       if (!BundleMember->hasValidDependencies()) {
2906 
2907         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
2908         BundleMember->Dependencies = 0;
2909         BundleMember->resetUnscheduledDeps();
2910 
2911         // Handle def-use chain dependencies.
2912         for (User *U : BundleMember->Inst->users()) {
2913           if (isa<Instruction>(U)) {
2914             ScheduleData *UseSD = getScheduleData(U);
2915             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2916               BundleMember->Dependencies++;
2917               ScheduleData *DestBundle = UseSD->FirstInBundle;
2918               if (!DestBundle->IsScheduled) {
2919                 BundleMember->incrementUnscheduledDeps(1);
2920               }
2921               if (!DestBundle->hasValidDependencies()) {
2922                 WorkList.push_back(DestBundle);
2923               }
2924             }
2925           } else {
2926             // I'm not sure if this can ever happen. But we need to be safe.
2927             // This lets the instruction/bundle never be scheduled and
2928             // eventually disable vectorization.
2929             BundleMember->Dependencies++;
2930             BundleMember->incrementUnscheduledDeps(1);
2931           }
2932         }
2933 
2934         // Handle the memory dependencies.
2935         ScheduleData *DepDest = BundleMember->NextLoadStore;
2936         if (DepDest) {
2937           Instruction *SrcInst = BundleMember->Inst;
2938           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
2939           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2940           unsigned numAliased = 0;
2941           unsigned DistToSrc = 1;
2942 
2943           while (DepDest) {
2944             assert(isInSchedulingRegion(DepDest));
2945 
2946             // We have two limits to reduce the complexity:
2947             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
2948             //    SLP->isAliased (which is the expensive part in this loop).
2949             // 2) MaxMemDepDistance: It's for very large blocks and it aborts
2950             //    the whole loop (even if the loop is fast, it's quadratic).
2951             //    It's important for the loop break condition (see below) to
2952             //    check this limit even between two read-only instructions.
2953             if (DistToSrc >= MaxMemDepDistance ||
2954                     ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
2955                      (numAliased >= AliasedCheckLimit ||
2956                       SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
2957 
2958               // We increment the counter only if the locations are aliased
2959               // (instead of counting all alias checks). This gives a better
2960               // balance between reduced runtime and accurate dependencies.
2961               numAliased++;
2962 
2963               DepDest->MemoryDependencies.push_back(BundleMember);
2964               BundleMember->Dependencies++;
2965               ScheduleData *DestBundle = DepDest->FirstInBundle;
2966               if (!DestBundle->IsScheduled) {
2967                 BundleMember->incrementUnscheduledDeps(1);
2968               }
2969               if (!DestBundle->hasValidDependencies()) {
2970                 WorkList.push_back(DestBundle);
2971               }
2972             }
2973             DepDest = DepDest->NextLoadStore;
2974 
2975             // Example, explaining the loop break condition: Let's assume our
2976             // starting instruction is i0 and MaxMemDepDistance = 3.
2977             //
2978             //                      +--------v--v--v
2979             //             i0,i1,i2,i3,i4,i5,i6,i7,i8
2980             //             +--------^--^--^
2981             //
2982             // MaxMemDepDistance let us stop alias-checking at i3 and we add
2983             // dependencies from i0 to i3,i4,.. (even if they are not aliased).
2984             // Previously we already added dependencies from i3 to i6,i7,i8
2985             // (because of MaxMemDepDistance). As we added a dependency from
2986             // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
2987             // and we can abort this loop at i6.
2988             if (DistToSrc >= 2 * MaxMemDepDistance)
2989                 break;
2990             DistToSrc++;
2991           }
2992         }
2993       }
2994       BundleMember = BundleMember->NextInBundle;
2995     }
2996     if (InsertInReadyList && SD->isReady()) {
2997       ReadyInsts.push_back(SD);
2998       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
2999     }
3000   }
3001 }
3002 
3003 void BoUpSLP::BlockScheduling::resetSchedule() {
3004   assert(ScheduleStart &&
3005          "tried to reset schedule on block which has not been scheduled");
3006   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3007     ScheduleData *SD = getScheduleData(I);
3008     assert(isInSchedulingRegion(SD));
3009     SD->IsScheduled = false;
3010     SD->resetUnscheduledDeps();
3011   }
3012   ReadyInsts.clear();
3013 }
3014 
3015 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
3016 
3017   if (!BS->ScheduleStart)
3018     return;
3019 
3020   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
3021 
3022   BS->resetSchedule();
3023 
3024   // For the real scheduling we use a more sophisticated ready-list: it is
3025   // sorted by the original instruction location. This lets the final schedule
3026   // be as  close as possible to the original instruction order.
3027   struct ScheduleDataCompare {
3028     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
3029       return SD2->SchedulingPriority < SD1->SchedulingPriority;
3030     }
3031   };
3032   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
3033 
3034   // Ensure that all dependency data is updated and fill the ready-list with
3035   // initial instructions.
3036   int Idx = 0;
3037   int NumToSchedule = 0;
3038   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
3039        I = I->getNextNode()) {
3040     ScheduleData *SD = BS->getScheduleData(I);
3041     assert(
3042         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
3043         "scheduler and vectorizer have different opinion on what is a bundle");
3044     SD->FirstInBundle->SchedulingPriority = Idx++;
3045     if (SD->isSchedulingEntity()) {
3046       BS->calculateDependencies(SD, false, this);
3047       NumToSchedule++;
3048     }
3049   }
3050   BS->initialFillReadyList(ReadyInsts);
3051 
3052   Instruction *LastScheduledInst = BS->ScheduleEnd;
3053 
3054   // Do the "real" scheduling.
3055   while (!ReadyInsts.empty()) {
3056     ScheduleData *picked = *ReadyInsts.begin();
3057     ReadyInsts.erase(ReadyInsts.begin());
3058 
3059     // Move the scheduled instruction(s) to their dedicated places, if not
3060     // there yet.
3061     ScheduleData *BundleMember = picked;
3062     while (BundleMember) {
3063       Instruction *pickedInst = BundleMember->Inst;
3064       if (LastScheduledInst->getNextNode() != pickedInst) {
3065         BS->BB->getInstList().remove(pickedInst);
3066         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
3067                                      pickedInst);
3068       }
3069       LastScheduledInst = pickedInst;
3070       BundleMember = BundleMember->NextInBundle;
3071     }
3072 
3073     BS->schedule(picked, ReadyInsts);
3074     NumToSchedule--;
3075   }
3076   assert(NumToSchedule == 0 && "could not schedule all instructions");
3077 
3078   // Avoid duplicate scheduling of the block.
3079   BS->ScheduleStart = nullptr;
3080 }
3081 
3082 unsigned BoUpSLP::getVectorElementSize(Value *V) {
3083   auto &DL = F->getParent()->getDataLayout();
3084 
3085   // If V is a store, just return the width of the stored value without
3086   // traversing the expression tree. This is the common case.
3087   if (auto *Store = dyn_cast<StoreInst>(V))
3088     return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
3089 
3090   // If V is not a store, we can traverse the expression tree to find loads
3091   // that feed it. The type of the loaded value may indicate a more suitable
3092   // width than V's type. We want to base the vector element size on the width
3093   // of memory operations where possible.
3094   SmallVector<Instruction *, 16> Worklist;
3095   SmallPtrSet<Instruction *, 16> Visited;
3096   if (auto *I = dyn_cast<Instruction>(V))
3097     Worklist.push_back(I);
3098 
3099   // Traverse the expression tree in bottom-up order looking for loads. If we
3100   // encounter an instruciton we don't yet handle, we give up.
3101   auto MaxWidth = 0u;
3102   auto FoundUnknownInst = false;
3103   while (!Worklist.empty() && !FoundUnknownInst) {
3104     auto *I = Worklist.pop_back_val();
3105     Visited.insert(I);
3106 
3107     // We should only be looking at scalar instructions here. If the current
3108     // instruction has a vector type, give up.
3109     auto *Ty = I->getType();
3110     if (isa<VectorType>(Ty))
3111       FoundUnknownInst = true;
3112 
3113     // If the current instruction is a load, update MaxWidth to reflect the
3114     // width of the loaded value.
3115     else if (isa<LoadInst>(I))
3116       MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
3117 
3118     // Otherwise, we need to visit the operands of the instruction. We only
3119     // handle the interesting cases from buildTree here. If an operand is an
3120     // instruction we haven't yet visited, we add it to the worklist.
3121     else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
3122              isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
3123       for (Use &U : I->operands())
3124         if (auto *J = dyn_cast<Instruction>(U.get()))
3125           if (!Visited.count(J))
3126             Worklist.push_back(J);
3127     }
3128 
3129     // If we don't yet handle the instruction, give up.
3130     else
3131       FoundUnknownInst = true;
3132   }
3133 
3134   // If we didn't encounter a memory access in the expression tree, or if we
3135   // gave up for some reason, just return the width of V.
3136   if (!MaxWidth || FoundUnknownInst)
3137     return DL.getTypeSizeInBits(V->getType());
3138 
3139   // Otherwise, return the maximum width we found.
3140   return MaxWidth;
3141 }
3142 
3143 /// The SLPVectorizer Pass.
3144 struct SLPVectorizer : public FunctionPass {
3145   typedef SmallVector<StoreInst *, 8> StoreList;
3146   typedef MapVector<Value *, StoreList> StoreListMap;
3147   typedef SmallVector<WeakVH, 8> WeakVHList;
3148   typedef MapVector<Value *, WeakVHList> WeakVHListMap;
3149 
3150   /// Pass identification, replacement for typeid
3151   static char ID;
3152 
3153   explicit SLPVectorizer() : FunctionPass(ID) {
3154     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
3155   }
3156 
3157   ScalarEvolution *SE;
3158   TargetTransformInfo *TTI;
3159   TargetLibraryInfo *TLI;
3160   AliasAnalysis *AA;
3161   LoopInfo *LI;
3162   DominatorTree *DT;
3163   AssumptionCache *AC;
3164 
3165   bool runOnFunction(Function &F) override {
3166     if (skipOptnoneFunction(F))
3167       return false;
3168 
3169     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3170     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
3171     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
3172     TLI = TLIP ? &TLIP->getTLI() : nullptr;
3173     AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3174     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
3175     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3176     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3177 
3178     Stores.clear();
3179     GEPs.clear();
3180     bool Changed = false;
3181 
3182     // If the target claims to have no vector registers don't attempt
3183     // vectorization.
3184     if (!TTI->getNumberOfRegisters(true))
3185       return false;
3186 
3187     // Use the vector register size specified by the target unless overridden
3188     // by a command-line option.
3189     // TODO: It would be better to limit the vectorization factor based on
3190     //       data type rather than just register size. For example, x86 AVX has
3191     //       256-bit registers, but it does not support integer operations
3192     //       at that width (that requires AVX2).
3193     if (MaxVectorRegSizeOption.getNumOccurrences())
3194       MaxVecRegSize = MaxVectorRegSizeOption;
3195     else
3196       MaxVecRegSize = TTI->getRegisterBitWidth(true);
3197 
3198     // Don't vectorize when the attribute NoImplicitFloat is used.
3199     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
3200       return false;
3201 
3202     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
3203 
3204     // Use the bottom up slp vectorizer to construct chains that start with
3205     // store instructions.
3206     BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
3207 
3208     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
3209     // delete instructions.
3210 
3211     // Scan the blocks in the function in post order.
3212     for (auto BB : post_order(&F.getEntryBlock())) {
3213       collectSeedInstructions(BB);
3214 
3215       // Vectorize trees that end at stores.
3216       if (NumStores > 0) {
3217         DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
3218         Changed |= vectorizeStoreChains(R);
3219       }
3220 
3221       // Vectorize trees that end at reductions.
3222       Changed |= vectorizeChainsInBlock(BB, R);
3223 
3224       // Vectorize the index computations of getelementptr instructions. This
3225       // is primarily intended to catch gather-like idioms ending at
3226       // non-consecutive loads.
3227       if (NumGEPs > 0) {
3228         DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
3229         Changed |= vectorizeGEPIndices(BB, R);
3230       }
3231     }
3232 
3233     if (Changed) {
3234       R.optimizeGatherSequence();
3235       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
3236       DEBUG(verifyFunction(F));
3237     }
3238     return Changed;
3239   }
3240 
3241   void getAnalysisUsage(AnalysisUsage &AU) const override {
3242     FunctionPass::getAnalysisUsage(AU);
3243     AU.addRequired<AssumptionCacheTracker>();
3244     AU.addRequired<ScalarEvolutionWrapperPass>();
3245     AU.addRequired<AAResultsWrapperPass>();
3246     AU.addRequired<TargetTransformInfoWrapperPass>();
3247     AU.addRequired<LoopInfoWrapperPass>();
3248     AU.addRequired<DominatorTreeWrapperPass>();
3249     AU.addPreserved<LoopInfoWrapperPass>();
3250     AU.addPreserved<DominatorTreeWrapperPass>();
3251     AU.addPreserved<AAResultsWrapperPass>();
3252     AU.addPreserved<GlobalsAAWrapperPass>();
3253     AU.setPreservesCFG();
3254   }
3255 
3256 private:
3257   /// \brief Collect store and getelementptr instructions and organize them
3258   /// according to the underlying object of their pointer operands. We sort the
3259   /// instructions by their underlying objects to reduce the cost of
3260   /// consecutive access queries.
3261   ///
3262   /// TODO: We can further reduce this cost if we flush the chain creation
3263   ///       every time we run into a memory barrier.
3264   void collectSeedInstructions(BasicBlock *BB);
3265 
3266   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
3267   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
3268 
3269   /// \brief Try to vectorize a list of operands.
3270   /// \@param BuildVector A list of users to ignore for the purpose of
3271   ///                     scheduling and that don't need extracting.
3272   /// \returns true if a value was vectorized.
3273   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3274                           ArrayRef<Value *> BuildVector = None,
3275                           bool allowReorder = false);
3276 
3277   /// \brief Try to vectorize a chain that may start at the operands of \V;
3278   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
3279 
3280   /// \brief Vectorize the store instructions collected in Stores.
3281   bool vectorizeStoreChains(BoUpSLP &R);
3282 
3283   /// \brief Vectorize the index computations of the getelementptr instructions
3284   /// collected in GEPs.
3285   bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
3286 
3287   /// \brief Scan the basic block and look for patterns that are likely to start
3288   /// a vectorization chain.
3289   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
3290 
3291   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
3292                            BoUpSLP &R, unsigned VecRegSize);
3293 
3294   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
3295                        BoUpSLP &R);
3296 
3297   /// The store instructions in a basic block organized by base pointer.
3298   StoreListMap Stores;
3299 
3300   /// The getelementptr instructions in a basic block organized by base pointer.
3301   WeakVHListMap GEPs;
3302 
3303   /// The number of store instructions in a basic block.
3304   unsigned NumStores;
3305 
3306   /// The number of getelementptr instructions in a basic block.
3307   unsigned NumGEPs;
3308 
3309   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3310 };
3311 
3312 /// \brief Check that the Values in the slice in VL array are still existent in
3313 /// the WeakVH array.
3314 /// Vectorization of part of the VL array may cause later values in the VL array
3315 /// to become invalid. We track when this has happened in the WeakVH array.
3316 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
3317                                unsigned SliceBegin, unsigned SliceSize) {
3318   VL = VL.slice(SliceBegin, SliceSize);
3319   VH = VH.slice(SliceBegin, SliceSize);
3320   return !std::equal(VL.begin(), VL.end(), VH.begin());
3321 }
3322 
3323 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
3324                                         int CostThreshold, BoUpSLP &R,
3325                                         unsigned VecRegSize) {
3326   unsigned ChainLen = Chain.size();
3327   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
3328         << "\n");
3329   unsigned Sz = R.getVectorElementSize(Chain[0]);
3330   unsigned VF = VecRegSize / Sz;
3331 
3332   if (!isPowerOf2_32(Sz) || VF < 2)
3333     return false;
3334 
3335   // Keep track of values that were deleted by vectorizing in the loop below.
3336   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
3337 
3338   bool Changed = false;
3339   // Look for profitable vectorizable trees at all offsets, starting at zero.
3340   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
3341     if (i + VF > e)
3342       break;
3343 
3344     // Check that a previous iteration of this loop did not delete the Value.
3345     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
3346       continue;
3347 
3348     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
3349           << "\n");
3350     ArrayRef<Value *> Operands = Chain.slice(i, VF);
3351 
3352     R.buildTree(Operands);
3353 
3354     int Cost = R.getTreeCost();
3355 
3356     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
3357     if (Cost < CostThreshold) {
3358       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3359       R.vectorizeTree();
3360 
3361       // Move to the next bundle.
3362       i += VF - 1;
3363       Changed = true;
3364     }
3365   }
3366 
3367   return Changed;
3368 }
3369 
3370 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3371                                     int costThreshold, BoUpSLP &R) {
3372   SetVector<StoreInst *> Heads, Tails;
3373   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
3374 
3375   // We may run into multiple chains that merge into a single chain. We mark the
3376   // stores that we vectorized so that we don't visit the same store twice.
3377   BoUpSLP::ValueSet VectorizedStores;
3378   bool Changed = false;
3379 
3380   // Do a quadratic search on all of the given stores and find
3381   // all of the pairs of stores that follow each other.
3382   SmallVector<unsigned, 16> IndexQueue;
3383   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3384     const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
3385     IndexQueue.clear();
3386     // If a store has multiple consecutive store candidates, search Stores
3387     // array according to the sequence: from i+1 to e, then from i-1 to 0.
3388     // This is because usually pairing with immediate succeeding or preceding
3389     // candidate create the best chance to find slp vectorization opportunity.
3390     unsigned j = 0;
3391     for (j = i + 1; j < e; ++j)
3392       IndexQueue.push_back(j);
3393     for (j = i; j > 0; --j)
3394       IndexQueue.push_back(j - 1);
3395 
3396     for (auto &k : IndexQueue) {
3397       if (isConsecutiveAccess(Stores[i], Stores[k], DL, *SE)) {
3398         Tails.insert(Stores[k]);
3399         Heads.insert(Stores[i]);
3400         ConsecutiveChain[Stores[i]] = Stores[k];
3401         break;
3402       }
3403     }
3404   }
3405 
3406   // For stores that start but don't end a link in the chain:
3407   for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
3408        it != e; ++it) {
3409     if (Tails.count(*it))
3410       continue;
3411 
3412     // We found a store instr that starts a chain. Now follow the chain and try
3413     // to vectorize it.
3414     BoUpSLP::ValueList Operands;
3415     StoreInst *I = *it;
3416     // Collect the chain into a list.
3417     while (Tails.count(I) || Heads.count(I)) {
3418       if (VectorizedStores.count(I))
3419         break;
3420       Operands.push_back(I);
3421       // Move to the next value in the chain.
3422       I = ConsecutiveChain[I];
3423     }
3424 
3425     // FIXME: Is division-by-2 the correct step? Should we assert that the
3426     // register size is a power-of-2?
3427     for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
3428       if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
3429         // Mark the vectorized stores so that we don't vectorize them again.
3430         VectorizedStores.insert(Operands.begin(), Operands.end());
3431         Changed = true;
3432         break;
3433       }
3434     }
3435   }
3436 
3437   return Changed;
3438 }
3439 
3440 void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
3441 
3442   // Initialize the collections. We will make a single pass over the block.
3443   Stores.clear();
3444   GEPs.clear();
3445   NumStores = NumGEPs = 0;
3446   const DataLayout &DL = BB->getModule()->getDataLayout();
3447 
3448   // Visit the store and getelementptr instructions in BB and organize them in
3449   // Stores and GEPs according to the underlying objects of their pointer
3450   // operands.
3451   for (Instruction &I : *BB) {
3452 
3453     // Ignore store instructions that are volatile or have a pointer operand
3454     // that doesn't point to a scalar type.
3455     if (auto *SI = dyn_cast<StoreInst>(&I)) {
3456       if (!SI->isSimple())
3457         continue;
3458       if (!isValidElementType(SI->getValueOperand()->getType()))
3459         continue;
3460       Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
3461       ++NumStores;
3462     }
3463 
3464     // Ignore getelementptr instructions that have more than one index, a
3465     // constant index, or a pointer operand that doesn't point to a scalar
3466     // type.
3467     else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
3468       auto Idx = GEP->idx_begin()->get();
3469       if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
3470         continue;
3471       if (!isValidElementType(Idx->getType()))
3472         continue;
3473       GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
3474       ++NumGEPs;
3475     }
3476   }
3477 }
3478 
3479 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3480   if (!A || !B)
3481     return false;
3482   Value *VL[] = { A, B };
3483   return tryToVectorizeList(VL, R, None, true);
3484 }
3485 
3486 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3487                                        ArrayRef<Value *> BuildVector,
3488                                        bool allowReorder) {
3489   if (VL.size() < 2)
3490     return false;
3491 
3492   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3493 
3494   // Check that all of the parts are scalar instructions of the same type.
3495   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3496   if (!I0)
3497     return false;
3498 
3499   unsigned Opcode0 = I0->getOpcode();
3500 
3501   // FIXME: Register size should be a parameter to this function, so we can
3502   // try different vectorization factors.
3503   unsigned Sz = R.getVectorElementSize(I0);
3504   unsigned VF = MinVecRegSize / Sz;
3505 
3506   for (Value *V : VL) {
3507     Type *Ty = V->getType();
3508     if (!isValidElementType(Ty))
3509       return false;
3510     Instruction *Inst = dyn_cast<Instruction>(V);
3511     if (!Inst || Inst->getOpcode() != Opcode0)
3512       return false;
3513   }
3514 
3515   bool Changed = false;
3516 
3517   // Keep track of values that were deleted by vectorizing in the loop below.
3518   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3519 
3520   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3521     unsigned OpsWidth = 0;
3522 
3523     if (i + VF > e)
3524       OpsWidth = e - i;
3525     else
3526       OpsWidth = VF;
3527 
3528     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3529       break;
3530 
3531     // Check that a previous iteration of this loop did not delete the Value.
3532     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3533       continue;
3534 
3535     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3536                  << "\n");
3537     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3538 
3539     ArrayRef<Value *> BuildVectorSlice;
3540     if (!BuildVector.empty())
3541       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3542 
3543     R.buildTree(Ops, BuildVectorSlice);
3544     // TODO: check if we can allow reordering also for other cases than
3545     // tryToVectorizePair()
3546     if (allowReorder && R.shouldReorder()) {
3547       assert(Ops.size() == 2);
3548       assert(BuildVectorSlice.empty());
3549       Value *ReorderedOps[] = { Ops[1], Ops[0] };
3550       R.buildTree(ReorderedOps, None);
3551     }
3552     int Cost = R.getTreeCost();
3553 
3554     if (Cost < -SLPCostThreshold) {
3555       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3556       Value *VectorizedRoot = R.vectorizeTree();
3557 
3558       // Reconstruct the build vector by extracting the vectorized root. This
3559       // way we handle the case where some elements of the vector are undefined.
3560       //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3561       if (!BuildVectorSlice.empty()) {
3562         // The insert point is the last build vector instruction. The vectorized
3563         // root will precede it. This guarantees that we get an instruction. The
3564         // vectorized tree could have been constant folded.
3565         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3566         unsigned VecIdx = 0;
3567         for (auto &V : BuildVectorSlice) {
3568           IRBuilder<true, NoFolder> Builder(
3569               InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter));
3570           InsertElementInst *IE = cast<InsertElementInst>(V);
3571           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3572               VectorizedRoot, Builder.getInt32(VecIdx++)));
3573           IE->setOperand(1, Extract);
3574           IE->removeFromParent();
3575           IE->insertAfter(Extract);
3576           InsertAfter = IE;
3577         }
3578       }
3579       // Move to the next bundle.
3580       i += VF - 1;
3581       Changed = true;
3582     }
3583   }
3584 
3585   return Changed;
3586 }
3587 
3588 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3589   if (!V)
3590     return false;
3591 
3592   // Try to vectorize V.
3593   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3594     return true;
3595 
3596   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3597   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3598   // Try to skip B.
3599   if (B && B->hasOneUse()) {
3600     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3601     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3602     if (tryToVectorizePair(A, B0, R)) {
3603       return true;
3604     }
3605     if (tryToVectorizePair(A, B1, R)) {
3606       return true;
3607     }
3608   }
3609 
3610   // Try to skip A.
3611   if (A && A->hasOneUse()) {
3612     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3613     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3614     if (tryToVectorizePair(A0, B, R)) {
3615       return true;
3616     }
3617     if (tryToVectorizePair(A1, B, R)) {
3618       return true;
3619     }
3620   }
3621   return 0;
3622 }
3623 
3624 /// \brief Generate a shuffle mask to be used in a reduction tree.
3625 ///
3626 /// \param VecLen The length of the vector to be reduced.
3627 /// \param NumEltsToRdx The number of elements that should be reduced in the
3628 ///        vector.
3629 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3630 ///        reduction. A pairwise reduction will generate a mask of
3631 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
3632 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3633 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3634 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3635                                    bool IsPairwise, bool IsLeft,
3636                                    IRBuilder<> &Builder) {
3637   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3638 
3639   SmallVector<Constant *, 32> ShuffleMask(
3640       VecLen, UndefValue::get(Builder.getInt32Ty()));
3641 
3642   if (IsPairwise)
3643     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3644     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3645       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3646   else
3647     // Move the upper half of the vector to the lower half.
3648     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3649       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3650 
3651   return ConstantVector::get(ShuffleMask);
3652 }
3653 
3654 
3655 /// Model horizontal reductions.
3656 ///
3657 /// A horizontal reduction is a tree of reduction operations (currently add and
3658 /// fadd) that has operations that can be put into a vector as its leaf.
3659 /// For example, this tree:
3660 ///
3661 /// mul mul mul mul
3662 ///  \  /    \  /
3663 ///   +       +
3664 ///    \     /
3665 ///       +
3666 /// This tree has "mul" as its reduced values and "+" as its reduction
3667 /// operations. A reduction might be feeding into a store or a binary operation
3668 /// feeding a phi.
3669 ///    ...
3670 ///    \  /
3671 ///     +
3672 ///     |
3673 ///  phi +=
3674 ///
3675 ///  Or:
3676 ///    ...
3677 ///    \  /
3678 ///     +
3679 ///     |
3680 ///   *p =
3681 ///
3682 class HorizontalReduction {
3683   SmallVector<Value *, 16> ReductionOps;
3684   SmallVector<Value *, 32> ReducedVals;
3685 
3686   BinaryOperator *ReductionRoot;
3687   PHINode *ReductionPHI;
3688 
3689   /// The opcode of the reduction.
3690   unsigned ReductionOpcode;
3691   /// The opcode of the values we perform a reduction on.
3692   unsigned ReducedValueOpcode;
3693   /// Should we model this reduction as a pairwise reduction tree or a tree that
3694   /// splits the vector in halves and adds those halves.
3695   bool IsPairwiseReduction;
3696 
3697 public:
3698   /// The width of one full horizontal reduction operation.
3699   unsigned ReduxWidth;
3700 
3701   HorizontalReduction()
3702     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3703     ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {}
3704 
3705   /// \brief Try to find a reduction tree.
3706   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
3707     assert((!Phi ||
3708             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3709            "Thi phi needs to use the binary operator");
3710 
3711     // We could have a initial reductions that is not an add.
3712     //  r *= v1 + v2 + v3 + v4
3713     // In such a case start looking for a tree rooted in the first '+'.
3714     if (Phi) {
3715       if (B->getOperand(0) == Phi) {
3716         Phi = nullptr;
3717         B = dyn_cast<BinaryOperator>(B->getOperand(1));
3718       } else if (B->getOperand(1) == Phi) {
3719         Phi = nullptr;
3720         B = dyn_cast<BinaryOperator>(B->getOperand(0));
3721       }
3722     }
3723 
3724     if (!B)
3725       return false;
3726 
3727     Type *Ty = B->getType();
3728     if (!isValidElementType(Ty))
3729       return false;
3730 
3731     const DataLayout &DL = B->getModule()->getDataLayout();
3732     ReductionOpcode = B->getOpcode();
3733     ReducedValueOpcode = 0;
3734     // FIXME: Register size should be a parameter to this function, so we can
3735     // try different vectorization factors.
3736     ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
3737     ReductionRoot = B;
3738     ReductionPHI = Phi;
3739 
3740     if (ReduxWidth < 4)
3741       return false;
3742 
3743     // We currently only support adds.
3744     if (ReductionOpcode != Instruction::Add &&
3745         ReductionOpcode != Instruction::FAdd)
3746       return false;
3747 
3748     // Post order traverse the reduction tree starting at B. We only handle true
3749     // trees containing only binary operators or selects.
3750     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
3751     Stack.push_back(std::make_pair(B, 0));
3752     while (!Stack.empty()) {
3753       Instruction *TreeN = Stack.back().first;
3754       unsigned EdgeToVist = Stack.back().second++;
3755       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3756 
3757       // Only handle trees in the current basic block.
3758       if (TreeN->getParent() != B->getParent())
3759         return false;
3760 
3761       // Each tree node needs to have one user except for the ultimate
3762       // reduction.
3763       if (!TreeN->hasOneUse() && TreeN != B)
3764         return false;
3765 
3766       // Postorder vist.
3767       if (EdgeToVist == 2 || IsReducedValue) {
3768         if (IsReducedValue) {
3769           // Make sure that the opcodes of the operations that we are going to
3770           // reduce match.
3771           if (!ReducedValueOpcode)
3772             ReducedValueOpcode = TreeN->getOpcode();
3773           else if (ReducedValueOpcode != TreeN->getOpcode())
3774             return false;
3775           ReducedVals.push_back(TreeN);
3776         } else {
3777           // We need to be able to reassociate the adds.
3778           if (!TreeN->isAssociative())
3779             return false;
3780           ReductionOps.push_back(TreeN);
3781         }
3782         // Retract.
3783         Stack.pop_back();
3784         continue;
3785       }
3786 
3787       // Visit left or right.
3788       Value *NextV = TreeN->getOperand(EdgeToVist);
3789       // We currently only allow BinaryOperator's and SelectInst's as reduction
3790       // values in our tree.
3791       if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
3792         Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
3793       else if (NextV != Phi)
3794         return false;
3795     }
3796     return true;
3797   }
3798 
3799   /// \brief Attempt to vectorize the tree found by
3800   /// matchAssociativeReduction.
3801   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3802     if (ReducedVals.empty())
3803       return false;
3804 
3805     unsigned NumReducedVals = ReducedVals.size();
3806     if (NumReducedVals < ReduxWidth)
3807       return false;
3808 
3809     Value *VectorizedTree = nullptr;
3810     IRBuilder<> Builder(ReductionRoot);
3811     FastMathFlags Unsafe;
3812     Unsafe.setUnsafeAlgebra();
3813     Builder.setFastMathFlags(Unsafe);
3814     unsigned i = 0;
3815 
3816     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3817       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3818 
3819       // Estimate cost.
3820       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3821       if (Cost >= -SLPCostThreshold)
3822         break;
3823 
3824       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3825                    << ". (HorRdx)\n");
3826 
3827       // Vectorize a tree.
3828       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3829       Value *VectorizedRoot = V.vectorizeTree();
3830 
3831       // Emit a reduction.
3832       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3833       if (VectorizedTree) {
3834         Builder.SetCurrentDebugLocation(Loc);
3835         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3836                                      ReducedSubTree, "bin.rdx");
3837       } else
3838         VectorizedTree = ReducedSubTree;
3839     }
3840 
3841     if (VectorizedTree) {
3842       // Finish the reduction.
3843       for (; i < NumReducedVals; ++i) {
3844         Builder.SetCurrentDebugLocation(
3845           cast<Instruction>(ReducedVals[i])->getDebugLoc());
3846         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3847                                      ReducedVals[i]);
3848       }
3849       // Update users.
3850       if (ReductionPHI) {
3851         assert(ReductionRoot && "Need a reduction operation");
3852         ReductionRoot->setOperand(0, VectorizedTree);
3853         ReductionRoot->setOperand(1, ReductionPHI);
3854       } else
3855         ReductionRoot->replaceAllUsesWith(VectorizedTree);
3856     }
3857     return VectorizedTree != nullptr;
3858   }
3859 
3860   unsigned numReductionValues() const {
3861     return ReducedVals.size();
3862   }
3863 
3864 private:
3865   /// \brief Calculate the cost of a reduction.
3866   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3867     Type *ScalarTy = FirstReducedVal->getType();
3868     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3869 
3870     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3871     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3872 
3873     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3874     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3875 
3876     int ScalarReduxCost =
3877         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3878 
3879     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3880                  << " for reduction that starts with " << *FirstReducedVal
3881                  << " (It is a "
3882                  << (IsPairwiseReduction ? "pairwise" : "splitting")
3883                  << " reduction)\n");
3884 
3885     return VecReduxCost - ScalarReduxCost;
3886   }
3887 
3888   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3889                             Value *R, const Twine &Name = "") {
3890     if (Opcode == Instruction::FAdd)
3891       return Builder.CreateFAdd(L, R, Name);
3892     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3893   }
3894 
3895   /// \brief Emit a horizontal reduction of the vectorized value.
3896   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3897     assert(VectorizedValue && "Need to have a vectorized tree node");
3898     assert(isPowerOf2_32(ReduxWidth) &&
3899            "We only handle power-of-two reductions for now");
3900 
3901     Value *TmpVec = VectorizedValue;
3902     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3903       if (IsPairwiseReduction) {
3904         Value *LeftMask =
3905           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3906         Value *RightMask =
3907           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3908 
3909         Value *LeftShuf = Builder.CreateShuffleVector(
3910           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3911         Value *RightShuf = Builder.CreateShuffleVector(
3912           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3913           "rdx.shuf.r");
3914         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3915                              "bin.rdx");
3916       } else {
3917         Value *UpperHalf =
3918           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3919         Value *Shuf = Builder.CreateShuffleVector(
3920           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3921         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3922       }
3923     }
3924 
3925     // The result is in the first element of the vector.
3926     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3927   }
3928 };
3929 
3930 /// \brief Recognize construction of vectors like
3931 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
3932 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
3933 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
3934 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
3935 ///
3936 /// Returns true if it matches
3937 ///
3938 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3939                             SmallVectorImpl<Value *> &BuildVector,
3940                             SmallVectorImpl<Value *> &BuildVectorOpds) {
3941   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3942     return false;
3943 
3944   InsertElementInst *IE = FirstInsertElem;
3945   while (true) {
3946     BuildVector.push_back(IE);
3947     BuildVectorOpds.push_back(IE->getOperand(1));
3948 
3949     if (IE->use_empty())
3950       return false;
3951 
3952     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3953     if (!NextUse)
3954       return true;
3955 
3956     // If this isn't the final use, make sure the next insertelement is the only
3957     // use. It's OK if the final constructed vector is used multiple times
3958     if (!IE->hasOneUse())
3959       return false;
3960 
3961     IE = NextUse;
3962   }
3963 
3964   return false;
3965 }
3966 
3967 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3968   return V->getType() < V2->getType();
3969 }
3970 
3971 /// \brief Try and get a reduction value from a phi node.
3972 ///
3973 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
3974 /// if they come from either \p ParentBB or a containing loop latch.
3975 ///
3976 /// \returns A candidate reduction value if possible, or \code nullptr \endcode
3977 /// if not possible.
3978 static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
3979                                 BasicBlock *ParentBB, LoopInfo *LI) {
3980   // There are situations where the reduction value is not dominated by the
3981   // reduction phi. Vectorizing such cases has been reported to cause
3982   // miscompiles. See PR25787.
3983   auto DominatedReduxValue = [&](Value *R) {
3984     return (
3985         dyn_cast<Instruction>(R) &&
3986         DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
3987   };
3988 
3989   Value *Rdx = nullptr;
3990 
3991   // Return the incoming value if it comes from the same BB as the phi node.
3992   if (P->getIncomingBlock(0) == ParentBB) {
3993     Rdx = P->getIncomingValue(0);
3994   } else if (P->getIncomingBlock(1) == ParentBB) {
3995     Rdx = P->getIncomingValue(1);
3996   }
3997 
3998   if (Rdx && DominatedReduxValue(Rdx))
3999     return Rdx;
4000 
4001   // Otherwise, check whether we have a loop latch to look at.
4002   Loop *BBL = LI->getLoopFor(ParentBB);
4003   if (!BBL)
4004     return nullptr;
4005   BasicBlock *BBLatch = BBL->getLoopLatch();
4006   if (!BBLatch)
4007     return nullptr;
4008 
4009   // There is a loop latch, return the incoming value if it comes from
4010   // that. This reduction pattern occassionaly turns up.
4011   if (P->getIncomingBlock(0) == BBLatch) {
4012     Rdx = P->getIncomingValue(0);
4013   } else if (P->getIncomingBlock(1) == BBLatch) {
4014     Rdx = P->getIncomingValue(1);
4015   }
4016 
4017   if (Rdx && DominatedReduxValue(Rdx))
4018     return Rdx;
4019 
4020   return nullptr;
4021 }
4022 
4023 /// \brief Attempt to reduce a horizontal reduction.
4024 /// If it is legal to match a horizontal reduction feeding
4025 /// the phi node P with reduction operators BI, then check if it
4026 /// can be done.
4027 /// \returns true if a horizontal reduction was matched and reduced.
4028 /// \returns false if a horizontal reduction was not matched.
4029 static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
4030                                         BoUpSLP &R, TargetTransformInfo *TTI) {
4031   if (!ShouldVectorizeHor)
4032     return false;
4033 
4034   HorizontalReduction HorRdx;
4035   if (!HorRdx.matchAssociativeReduction(P, BI))
4036     return false;
4037 
4038   // If there is a sufficient number of reduction values, reduce
4039   // to a nearby power-of-2. Can safely generate oversized
4040   // vectors and rely on the backend to split them to legal sizes.
4041   HorRdx.ReduxWidth =
4042     std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
4043 
4044   return HorRdx.tryToReduce(R, TTI);
4045 }
4046 
4047 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
4048   bool Changed = false;
4049   SmallVector<Value *, 4> Incoming;
4050   SmallSet<Value *, 16> VisitedInstrs;
4051 
4052   bool HaveVectorizedPhiNodes = true;
4053   while (HaveVectorizedPhiNodes) {
4054     HaveVectorizedPhiNodes = false;
4055 
4056     // Collect the incoming values from the PHIs.
4057     Incoming.clear();
4058     for (Instruction &I : *BB) {
4059       PHINode *P = dyn_cast<PHINode>(&I);
4060       if (!P)
4061         break;
4062 
4063       if (!VisitedInstrs.count(P))
4064         Incoming.push_back(P);
4065     }
4066 
4067     // Sort by type.
4068     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
4069 
4070     // Try to vectorize elements base on their type.
4071     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
4072                                            E = Incoming.end();
4073          IncIt != E;) {
4074 
4075       // Look for the next elements with the same type.
4076       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
4077       while (SameTypeIt != E &&
4078              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
4079         VisitedInstrs.insert(*SameTypeIt);
4080         ++SameTypeIt;
4081       }
4082 
4083       // Try to vectorize them.
4084       unsigned NumElts = (SameTypeIt - IncIt);
4085       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
4086       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
4087         // Success start over because instructions might have been changed.
4088         HaveVectorizedPhiNodes = true;
4089         Changed = true;
4090         break;
4091       }
4092 
4093       // Start over at the next instruction of a different type (or the end).
4094       IncIt = SameTypeIt;
4095     }
4096   }
4097 
4098   VisitedInstrs.clear();
4099 
4100   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
4101     // We may go through BB multiple times so skip the one we have checked.
4102     if (!VisitedInstrs.insert(&*it).second)
4103       continue;
4104 
4105     if (isa<DbgInfoIntrinsic>(it))
4106       continue;
4107 
4108     // Try to vectorize reductions that use PHINodes.
4109     if (PHINode *P = dyn_cast<PHINode>(it)) {
4110       // Check that the PHI is a reduction PHI.
4111       if (P->getNumIncomingValues() != 2)
4112         return Changed;
4113 
4114       Value *Rdx = getReductionValue(DT, P, BB, LI);
4115 
4116       // Check if this is a Binary Operator.
4117       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
4118       if (!BI)
4119         continue;
4120 
4121       // Try to match and vectorize a horizontal reduction.
4122       if (canMatchHorizontalReduction(P, BI, R, TTI)) {
4123         Changed = true;
4124         it = BB->begin();
4125         e = BB->end();
4126         continue;
4127       }
4128 
4129      Value *Inst = BI->getOperand(0);
4130       if (Inst == P)
4131         Inst = BI->getOperand(1);
4132 
4133       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
4134         // We would like to start over since some instructions are deleted
4135         // and the iterator may become invalid value.
4136         Changed = true;
4137         it = BB->begin();
4138         e = BB->end();
4139         continue;
4140       }
4141 
4142       continue;
4143     }
4144 
4145     if (ShouldStartVectorizeHorAtStore)
4146       if (StoreInst *SI = dyn_cast<StoreInst>(it))
4147         if (BinaryOperator *BinOp =
4148                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
4149           if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) ||
4150               tryToVectorize(BinOp, R)) {
4151             Changed = true;
4152             it = BB->begin();
4153             e = BB->end();
4154             continue;
4155           }
4156         }
4157 
4158     // Try to vectorize horizontal reductions feeding into a return.
4159     if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
4160       if (RI->getNumOperands() != 0)
4161         if (BinaryOperator *BinOp =
4162                 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
4163           DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
4164           if (tryToVectorizePair(BinOp->getOperand(0),
4165                                  BinOp->getOperand(1), R)) {
4166             Changed = true;
4167             it = BB->begin();
4168             e = BB->end();
4169             continue;
4170           }
4171         }
4172 
4173     // Try to vectorize trees that start at compare instructions.
4174     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
4175       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
4176         Changed = true;
4177         // We would like to start over since some instructions are deleted
4178         // and the iterator may become invalid value.
4179         it = BB->begin();
4180         e = BB->end();
4181         continue;
4182       }
4183 
4184       for (int i = 0; i < 2; ++i) {
4185         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
4186           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
4187             Changed = true;
4188             // We would like to start over since some instructions are deleted
4189             // and the iterator may become invalid value.
4190             it = BB->begin();
4191             e = BB->end();
4192             break;
4193           }
4194         }
4195       }
4196       continue;
4197     }
4198 
4199     // Try to vectorize trees that start at insertelement instructions.
4200     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
4201       SmallVector<Value *, 16> BuildVector;
4202       SmallVector<Value *, 16> BuildVectorOpds;
4203       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
4204         continue;
4205 
4206       // Vectorize starting with the build vector operands ignoring the
4207       // BuildVector instructions for the purpose of scheduling and user
4208       // extraction.
4209       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
4210         Changed = true;
4211         it = BB->begin();
4212         e = BB->end();
4213       }
4214 
4215       continue;
4216     }
4217   }
4218 
4219   return Changed;
4220 }
4221 
4222 bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
4223   auto Changed = false;
4224   for (auto &Entry : GEPs) {
4225 
4226     // If the getelementptr list has fewer than two elements, there's nothing
4227     // to do.
4228     if (Entry.second.size() < 2)
4229       continue;
4230 
4231     DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
4232                  << Entry.second.size() << ".\n");
4233 
4234     // We process the getelementptr list in chunks of 16 (like we do for
4235     // stores) to minimize compile-time.
4236     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
4237       auto Len = std::min<unsigned>(BE - BI, 16);
4238       auto GEPList = makeArrayRef(&Entry.second[BI], Len);
4239 
4240       // Initialize a set a candidate getelementptrs. Note that we use a
4241       // SetVector here to preserve program order. If the index computations
4242       // are vectorizable and begin with loads, we want to minimize the chance
4243       // of having to reorder them later.
4244       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
4245 
4246       // Some of the candidates may have already been vectorized after we
4247       // initially collected them. If so, the WeakVHs will have nullified the
4248       // values, so remove them from the set of candidates.
4249       Candidates.remove(nullptr);
4250 
4251       // Remove from the set of candidates all pairs of getelementptrs with
4252       // constant differences. Such getelementptrs are likely not good
4253       // candidates for vectorization in a bottom-up phase since one can be
4254       // computed from the other. We also ensure all candidate getelementptr
4255       // indices are unique.
4256       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
4257         auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
4258         if (!Candidates.count(GEPI))
4259           continue;
4260         auto *SCEVI = SE->getSCEV(GEPList[I]);
4261         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
4262           auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
4263           auto *SCEVJ = SE->getSCEV(GEPList[J]);
4264           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
4265             Candidates.remove(GEPList[I]);
4266             Candidates.remove(GEPList[J]);
4267           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
4268             Candidates.remove(GEPList[J]);
4269           }
4270         }
4271       }
4272 
4273       // We break out of the above computation as soon as we know there are
4274       // fewer than two candidates remaining.
4275       if (Candidates.size() < 2)
4276         continue;
4277 
4278       // Add the single, non-constant index of each candidate to the bundle. We
4279       // ensured the indices met these constraints when we originally collected
4280       // the getelementptrs.
4281       SmallVector<Value *, 16> Bundle(Candidates.size());
4282       auto BundleIndex = 0u;
4283       for (auto *V : Candidates) {
4284         auto *GEP = cast<GetElementPtrInst>(V);
4285         auto *GEPIdx = GEP->idx_begin()->get();
4286         assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
4287         Bundle[BundleIndex++] = GEPIdx;
4288       }
4289 
4290       // Try and vectorize the indices. We are currently only interested in
4291       // gather-like cases of the form:
4292       //
4293       // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
4294       //
4295       // where the loads of "a", the loads of "b", and the subtractions can be
4296       // performed in parallel. It's likely that detecting this pattern in a
4297       // bottom-up phase will be simpler and less costly than building a
4298       // full-blown top-down phase beginning at the consecutive loads.
4299       Changed |= tryToVectorizeList(Bundle, R);
4300     }
4301   }
4302   return Changed;
4303 }
4304 
4305 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
4306   bool Changed = false;
4307   // Attempt to sort and vectorize each of the store-groups.
4308   for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
4309        ++it) {
4310     if (it->second.size() < 2)
4311       continue;
4312 
4313     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
4314           << it->second.size() << ".\n");
4315 
4316     // Process the stores in chunks of 16.
4317     // TODO: The limit of 16 inhibits greater vectorization factors.
4318     //       For example, AVX2 supports v32i8. Increasing this limit, however,
4319     //       may cause a significant compile-time increase.
4320     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
4321       unsigned Len = std::min<unsigned>(CE - CI, 16);
4322       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
4323                                  -SLPCostThreshold, R);
4324     }
4325   }
4326   return Changed;
4327 }
4328 
4329 } // end anonymous namespace
4330 
4331 char SLPVectorizer::ID = 0;
4332 static const char lv_name[] = "SLP Vectorizer";
4333 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
4334 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
4335 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
4336 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
4337 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
4338 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4339 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
4340 
4341 namespace llvm {
4342 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
4343 }
4344