1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
13 // interfaces to generate IR that is likely to result in an optimal binary.
14 //
// The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
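//
// For example (a conceptual sketch, not actual IR), with a vectorization
// factor of 4 a loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + 1;
// is rewritten so that each new iteration computes
//   A[i:i+3] = B[i:i+3] + <1, 1, 1, 1>;  i += 4;
// and a scalar remainder loop handles any leftover iterations.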
18 //
// This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 //    of the vectorization.
23 // 3. SingleBlockLoopVectorizer - A unit that performs the actual
24 //    widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 //    of vectorization. It decides on the optimal vector width, which
27 //    can be one, if vectorization is not profitable.
28 //===----------------------------------------------------------------------===//
29 //
30 // The reduction-variable vectorization is based on the paper:
31 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
32 //
33 // Variable uniformity checks are inspired by:
34 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
35 //
36 // Other ideas/concepts are from:
37 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
38 //
39 //===----------------------------------------------------------------------===//
40 #define LV_NAME "loop-vectorize"
41 #define DEBUG_TYPE LV_NAME
42 #include "llvm/Constants.h"
43 #include "llvm/DerivedTypes.h"
44 #include "llvm/Instructions.h"
45 #include "llvm/LLVMContext.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Analysis/LoopPass.h"
48 #include "llvm/Value.h"
49 #include "llvm/Function.h"
50 #include "llvm/Analysis/Verifier.h"
51 #include "llvm/Module.h"
52 #include "llvm/Type.h"
53 #include "llvm/ADT/SmallVector.h"
54 #include "llvm/ADT/StringExtras.h"
55 #include "llvm/Analysis/AliasAnalysis.h"
56 #include "llvm/Analysis/AliasSetTracker.h"
57 #include "llvm/Analysis/ScalarEvolution.h"
58 #include "llvm/Analysis/Dominators.h"
59 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
60 #include "llvm/Analysis/ScalarEvolutionExpander.h"
61 #include "llvm/Analysis/LoopInfo.h"
62 #include "llvm/Analysis/ValueTracking.h"
63 #include "llvm/Transforms/Scalar.h"
64 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
65 #include "llvm/TargetTransformInfo.h"
66 #include "llvm/Support/CommandLine.h"
67 #include "llvm/Support/Debug.h"
68 #include "llvm/Support/raw_ostream.h"
69 #include "llvm/DataLayout.h"
70 #include "llvm/Transforms/Utils/Local.h"
71 #include <algorithm>
72 using namespace llvm;
73 
74 static cl::opt<unsigned>
75 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
76           cl::desc("Set the default vectorization width. Zero is autoselect."));
77 
78 namespace {
79 
80 // Forward declarations.
81 class LoopVectorizationLegality;
82 class LoopVectorizationCostModel;
83 
84 /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
85 /// block to a specified vectorization factor (VF).
86 /// This class performs the widening of scalars into vectors, or multiple
87 /// scalars. This class also implements the following features:
88 /// * It inserts an epilogue loop for handling loops that don't have iteration
89 ///   counts that are known to be a multiple of the vectorization factor.
90 /// * It handles the code generation for reduction variables.
91 /// * Scalarization (implementation using scalars) of un-vectorizable
92 ///   instructions.
93 /// SingleBlockLoopVectorizer does not perform any vectorization-legality
94 /// checks, and relies on the caller to check for the different legality
95 /// aspects. The SingleBlockLoopVectorizer relies on the
96 /// LoopVectorizationLegality class to provide information about the induction
97 /// and reduction variables that were found to a given vectorization factor.
98 class SingleBlockLoopVectorizer {
99 public:
100   /// Ctor.
101   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
102                             DominatorTree *dt, LPPassManager *Lpm,
103                             unsigned VecWidth):
104   OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
105   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
106 
107   // Perform the actual loop widening (vectorization).
108   void vectorize(LoopVectorizationLegality *Legal) {
    /// Create a new empty loop. Unlink the old loop and connect the new one.
110     createEmptyLoop(Legal);
111     /// Widen each instruction in the old loop to a new one in the new loop.
112     /// Use the Legality module to find the induction and reduction variables.
113     vectorizeLoop(Legal);
    // Register the new loop.
    updateAnalysis();
  }
117 
118 private:
119   /// Create an empty loop, based on the loop ranges of the old loop.
120   void createEmptyLoop(LoopVectorizationLegality *Legal);
121   /// Copy and widen the instructions from the old loop.
122   void vectorizeLoop(LoopVectorizationLegality *Legal);
123   /// Insert the new loop to the loop hierarchy and pass manager.
124   void updateAnalysis();
125 
126   /// This instruction is un-vectorizable. Implement it as a sequence
127   /// of scalars.
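  /// As a rough sketch (assuming VF = 4), a scalar instruction 'c = a / b'
  /// becomes four lane extracts from the widened operands, four scalar
  /// divisions, and four inserts that rebuild the result vector.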
128   void scalarizeInstruction(Instruction *Instr);
129 
130   /// Create a broadcast instruction. This method generates a broadcast
131   /// instruction (shuffle) for loop invariant values and for the induction
132   /// value. If this is the induction variable then we extend it to N, N+1, ...
133   /// this is needed because each iteration in the loop corresponds to a SIMD
134   /// element.
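  /// A minimal IR sketch of the broadcast itself, assuming VF = 4 and an
  /// i32 value %x (the value names below are illustrative only):
  ///   %bcast.ins   = insertelement <4 x i32> undef, i32 %x, i32 0
  ///   %bcast.splat = shufflevector <4 x i32> %bcast.ins, <4 x i32> undef,
  ///                                <4 x i32> zeroinitializer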
135   Value *getBroadcastInstrs(Value *V);
136 
  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2, ...
  /// to the consecutive elements of the vector, starting from zero.
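  /// For example, with VF = 4 a broadcast input of <n, n, n, n> becomes
  /// <n, n+1, n+2, n+3>.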
139   Value *getConsecutiveVector(Value* Val);
140 
141   /// When we go over instructions in the basic block we rely on previous
142   /// values within the current basic block or on loop invariant values.
143   /// When we widen (vectorize) values we place them in the map. If the values
144   /// are not within the map, they have to be loop invariant, so we simply
145   /// broadcast them into a vector.
146   Value *getVectorValue(Value *V);
147 
148   /// Get a uniform vector of constant integers. We use this to get
149   /// vectors of ones and zeros for the reduction code.
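  /// For example, with VF = 4 calling getUniformVector(1, i32) produces the
  /// constant <i32 1, i32 1, i32 1, i32 1>.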
150   Constant* getUniformVector(unsigned Val, Type* ScalarTy);
151 
152   typedef DenseMap<Value*, Value*> ValueMap;
153 
154   /// The original loop.
155   Loop *OrigLoop;
156   // Scev analysis to use.
157   ScalarEvolution *SE;
158   // Loop Info.
159   LoopInfo *LI;
160   // Dominator Tree.
161   DominatorTree *DT;
  // Loop Pass Manager.
163   LPPassManager *LPM;
164   // The vectorization factor to use.
165   unsigned VF;
166 
167   // The builder that we use
168   IRBuilder<> Builder;
169 
170   // --- Vectorization state ---
171 
172   /// The vector-loop preheader.
173   BasicBlock *LoopVectorPreHeader;
174   /// The scalar-loop preheader.
175   BasicBlock *LoopScalarPreHeader;
176   /// Middle Block between the vector and the scalar.
177   BasicBlock *LoopMiddleBlock;
178   ///The ExitBlock of the scalar loop.
179   BasicBlock *LoopExitBlock;
180   ///The vector loop body.
181   BasicBlock *LoopVectorBody;
182   ///The scalar loop body.
183   BasicBlock *LoopScalarBody;
184   ///The first bypass block.
185   BasicBlock *LoopBypassBlock;
186 
187   /// The new Induction variable which was added to the new block.
188   PHINode *Induction;
189   /// The induction variable of the old basic block.
190   PHINode *OldInduction;
191   // Maps scalars to widened vectors.
192   ValueMap WidenMap;
193 };
194 
195 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
196 /// to what vectorization factor.
197 /// This class does not look at the profitability of vectorization, only the
198 /// legality. This class has two main kinds of checks:
199 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
200 ///   will change the order of memory accesses in a way that will change the
201 ///   correctness of the program.
202 /// * Scalars checks - The code in canVectorizeBlock checks for a number
203 ///   of different conditions, such as the availability of a single induction
///   variable, that all types are supported and vectorizable, etc.
205 /// This code reflects the capabilities of SingleBlockLoopVectorizer.
206 /// This class is also used by SingleBlockLoopVectorizer for identifying
/// the induction variable and the different reduction variables.
208 class LoopVectorizationLegality {
209 public:
210   LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
211   TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
212 
213   /// This represents the kinds of reductions that we support.
214   /// We use the enum values to hold the 'identity' value for
215   /// each operand. This value does not change the result if applied.
216   enum ReductionKind {
217     NoReduction = -1, /// Not a reduction.
218     IntegerAdd  = 0,  /// Sum of numbers.
219     IntegerMult = 1,  /// Product of numbers.
220     IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
221     IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
222     IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
223   };
224 
225   /// This POD struct holds information about reduction variables.
226   struct ReductionDescriptor {
227     // Default C'tor
228     ReductionDescriptor():
229     StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
230 
231     // C'tor.
232     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
233     StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
234 
235     // The starting value of the reduction.
236     // It does not have to be zero!
237     Value *StartValue;
    // The instruction whose value is used outside the loop.
239     Instruction *LoopExitInstr;
240     // The kind of the reduction.
241     ReductionKind Kind;
242   };
243 
244   /// ReductionList contains the reduction descriptors for all
245   /// of the reductions that were found in the loop.
246   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
247 
248   /// Returns true if it is legal to vectorize this loop.
249   /// This does not mean that it is profitable to vectorize this
250   /// loop, only that it is legal to do so.
251   bool canVectorize();
252 
253   /// Returns the Induction variable.
254   PHINode *getInduction() {return Induction;}
255 
256   /// Returns the reduction variables found in the loop.
257   ReductionList *getReductionVars() { return &Reductions; }
258 
259   /// Check if the pointer returned by this GEP is consecutive
260   /// when the index is vectorized. This happens when the last
261   /// index of the GEP is consecutive, like the induction variable.
262   /// This check allows us to vectorize A[idx] into a wide load/store.
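  /// For example, A[i] (with 'i' the induction variable) is consecutive,
  /// while A[B[i]] or A[2 * i] (stride two) is not.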
263   bool isConsecutiveGep(Value *Ptr);
264 
265   /// Returns true if this instruction will remain scalar after vectorization.
266   bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
267 
268 private:
269   /// Check if a single basic block loop is vectorizable.
270   /// At this point we know that this is a loop with a constant trip count
271   /// and we only need to check individual instructions.
272   bool canVectorizeBlock(BasicBlock &BB);
273 
274   /// When we vectorize loops we may change the order in which
275   /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if BB is vectorizable.
278   bool canVectorizeMemory(BasicBlock &BB);
279 
  /// Returns true if 'Phi' is a reduction variable of kind 'Kind'.
  /// If it is, the variable is added to the ReductionList.
282   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
283   /// Returns true if the instruction I can be a reduction variable of type
284   /// 'Kind'.
285   bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns true if 'Phi' is an induction variable.
287   bool isInductionVariable(PHINode *Phi);
288 
289   /// The loop that we evaluate.
290   Loop *TheLoop;
291   /// Scev analysis.
292   ScalarEvolution *SE;
293   /// DataLayout analysis.
294   DataLayout *DL;
295 
296   //  ---  vectorization state --- //
297 
298   /// Holds the induction variable.
299   PHINode *Induction;
300   /// Holds the reduction variables.
301   ReductionList Reductions;
302   /// Allowed outside users. This holds the reduction
303   /// vars which can be accessed from outside the loop.
304   SmallPtrSet<Value*, 4> AllowedExit;
305   /// This set holds the variables which are known to be uniform after
306   /// vectorization.
307   SmallPtrSet<Instruction*, 4> Uniforms;
308 };
309 
310 /// LoopVectorizationCostModel - estimates the expected speedups due to
311 /// vectorization.
312 /// In many cases vectorization is not profitable. This can happen because
313 /// of a number of reasons. In this class we mainly attempt to predict
314 /// the expected speedup/slowdowns due to the supported instruction set.
315 /// We use the VectorTargetTransformInfo to query the different backends
316 /// for the cost of different operations.
317 class LoopVectorizationCostModel {
318 public:
319   /// C'tor.
320   LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
321                              LoopVectorizationLegality *Leg,
322                              const VectorTargetTransformInfo *Vtti):
323   TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
324 
325   /// Returns the most profitable vectorization factor for the loop that is
326   /// smaller or equal to the VF argument. This method checks every power
327   /// of two up to VF.
328   unsigned findBestVectorizationFactor(unsigned VF = 8);
329 
330 private:
331   /// Returns the expected execution cost. The unit of the cost does
332   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
335   unsigned expectedCost(unsigned VF);
336 
337   /// Returns the execution time cost of an instruction for a given vector
338   /// width. Vector width of one means scalar.
339   unsigned getInstructionCost(Instruction *I, unsigned VF);
340 
341   /// A helper function for converting Scalar types to vector types.
342   /// If the incoming type is void, we return void. If the VF is 1, we return
343   /// the scalar type.
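  /// For example, ToVectorTy(i32, 4) returns <4 x i32>, while
  /// ToVectorTy(i32, 1) returns i32 unchanged.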
344   static Type* ToVectorTy(Type *Scalar, unsigned VF);
345 
346   /// The loop that we evaluate.
347   Loop *TheLoop;
348   /// Scev analysis.
349   ScalarEvolution *SE;
350 
351   /// Vectorization legality.
352   LoopVectorizationLegality *Legal;
353   /// Vector target information.
354   const VectorTargetTransformInfo *VTTI;
355 };
356 
357 struct LoopVectorize : public LoopPass {
358   static char ID; // Pass identification, replacement for typeid
359 
360   LoopVectorize() : LoopPass(ID) {
361     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
362   }
363 
364   ScalarEvolution *SE;
365   DataLayout *DL;
366   LoopInfo *LI;
367   TargetTransformInfo *TTI;
368   DominatorTree *DT;
369 
370   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
371     // We only vectorize innermost loops.
372     if (!L->empty())
373       return false;
374 
375     SE = &getAnalysis<ScalarEvolution>();
376     DL = getAnalysisIfAvailable<DataLayout>();
377     LI = &getAnalysis<LoopInfo>();
378     TTI = getAnalysisIfAvailable<TargetTransformInfo>();
379     DT = &getAnalysis<DominatorTree>();
380 
381     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
382           L->getHeader()->getParent()->getName() << "\"\n");
383 
384     // Check if it is legal to vectorize the loop.
385     LoopVectorizationLegality LVL(L, SE, DL);
386     if (!LVL.canVectorize()) {
387       DEBUG(dbgs() << "LV: Not vectorizing.\n");
388       return false;
389     }
390 
    // Select the preferred vectorization factor.
392     unsigned VF = 1;
393     if (VectorizationFactor == 0) {
394       const VectorTargetTransformInfo *VTTI = 0;
395       if (TTI)
396         VTTI = TTI->getVectorTargetTransformInfo();
397       // Use the cost model.
398       LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
399       VF = CM.findBestVectorizationFactor();
400 
401       if (VF == 1) {
402         DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
403         return false;
404       }
405 
406     } else {
407       // Use the user command flag.
408       VF = VectorizationFactor;
409     }
410 
411     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
412           L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
413           "\n");
414 
    // If we decided that it is *legal* to vectorize the loop then do it.
416     SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
417     LB.vectorize(&LVL);
418 
419     DEBUG(verifyFunction(*L->getHeader()->getParent()));
420     return true;
421   }
422 
423   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
424     LoopPass::getAnalysisUsage(AU);
425     AU.addRequiredID(LoopSimplifyID);
426     AU.addRequiredID(LCSSAID);
427     AU.addRequired<LoopInfo>();
428     AU.addRequired<ScalarEvolution>();
429     AU.addRequired<DominatorTree>();
430     AU.addPreserved<LoopInfo>();
431     AU.addPreserved<DominatorTree>();
432   }
433 
434 };
435 
436 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
437   // Instructions that access the old induction variable
438   // actually want to get the new one.
439   if (V == OldInduction)
440     V = Induction;
441   // Create the types.
442   LLVMContext &C = V->getContext();
443   Type *VTy = VectorType::get(V->getType(), VF);
444   Type *I32 = IntegerType::getInt32Ty(C);
445   Constant *Zero = ConstantInt::get(I32, 0);
446   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
447   Value *UndefVal = UndefValue::get(VTy);
448   // Insert the value into a new vector.
449   Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
450   // Broadcast the scalar into all locations in the vector.
451   Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
452                                              "broadcast");
453   // We are accessing the induction variable. Make sure to promote the
454   // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
455   if (V == Induction)
456     return getConsecutiveVector(Shuf);
457   return Shuf;
458 }
459 
460 Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
461   assert(Val->getType()->isVectorTy() && "Must be a vector");
462   assert(Val->getType()->getScalarType()->isIntegerTy() &&
463          "Elem must be an integer");
464   // Create the types.
465   Type *ITy = Val->getType()->getScalarType();
466   VectorType *Ty = cast<VectorType>(Val->getType());
467   unsigned VLen = Ty->getNumElements();
468   SmallVector<Constant*, 8> Indices;
469 
470   // Create a vector of consecutive numbers from zero to VF.
471   for (unsigned i = 0; i < VLen; ++i)
472     Indices.push_back(ConstantInt::get(ITy, i));
473 
474   // Add the consecutive indices to the vector value.
475   Constant *Cv = ConstantVector::get(Indices);
476   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
477   return Builder.CreateAdd(Val, Cv, "induction");
478 }
479 
480 bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
481   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
482   if (!Gep)
483     return false;
484 
485   unsigned NumOperands = Gep->getNumOperands();
486   Value *LastIndex = Gep->getOperand(NumOperands - 1);
487 
488   // Check that all of the gep indices are uniform except for the last.
489   for (unsigned i = 0; i < NumOperands - 1; ++i)
490     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
491       return false;
492 
  // We can emit wide load/stores only if the last index is the induction
  // variable.
495   const SCEV *Last = SE->getSCEV(LastIndex);
496   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
497     const SCEV *Step = AR->getStepRecurrence(*SE);
498 
499     // The memory is consecutive because the last index is consecutive
500     // and all other indices are loop invariant.
501     if (Step->isOne())
502       return true;
503   }
504 
505   return false;
506 }
507 
508 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
509   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
510   // If we saved a vectorized copy of V, use it.
511   Value *&MapEntry = WidenMap[V];
512   if (MapEntry)
513     return MapEntry;
514 
515   // Broadcast V and save the value for future uses.
516   Value *B = getBroadcastInstrs(V);
517   MapEntry = B;
518   return B;
519 }
520 
521 Constant*
522 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
523   SmallVector<Constant*, 8> Indices;
  // Create a vector of VF copies of the scalar value 'Val'.
525   for (unsigned i = 0; i < VF; ++i)
526     Indices.push_back(ConstantInt::get(ScalarTy, Val));
527 
  // Build and return the uniform vector constant.
529   return ConstantVector::get(Indices);
530 }
531 
532 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
533   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
534   // Holds vector parameters or scalars, in case of uniform vals.
535   SmallVector<Value*, 8> Params;
536 
537   // Find all of the vectorized parameters.
538   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
539     Value *SrcOp = Instr->getOperand(op);
540 
541     // If we are accessing the old induction variable, use the new one.
542     if (SrcOp == OldInduction) {
543       Params.push_back(getBroadcastInstrs(Induction));
544       continue;
545     }
546 
547     // Try using previously calculated values.
548     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
549 
550     // If the src is an instruction that appeared earlier in the basic block
551     // then it should already be vectorized.
552     if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
553       assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
554       // The parameter is a vector value from earlier.
555       Params.push_back(WidenMap[SrcInst]);
556     } else {
557       // The parameter is a scalar from outside the loop. Maybe even a constant.
558       Params.push_back(SrcOp);
559     }
560   }
561 
562   assert(Params.size() == Instr->getNumOperands() &&
563          "Invalid number of operands");
564 
  // Does this instruction return a value?
566   bool IsVoidRetTy = Instr->getType()->isVoidTy();
567   Value *VecResults = 0;
568 
569   // If we have a return value, create an empty vector. We place the scalarized
570   // instructions in this vector.
571   if (!IsVoidRetTy)
572     VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
573 
574   // For each scalar that we create:
575   for (unsigned i = 0; i < VF; ++i) {
576     Instruction *Cloned = Instr->clone();
577     if (!IsVoidRetTy)
578       Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
580     for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
581       Value *Op = Params[op];
582       // Param is a vector. Need to extract the right lane.
583       if (Op->getType()->isVectorTy())
584         Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
585       Cloned->setOperand(op, Op);
586     }
587 
588     // Place the cloned scalar in the new loop.
589     Builder.Insert(Cloned);
590 
591     // If the original scalar returns a value we need to place it in a vector
592     // so that future users will be able to use it.
593     if (!IsVoidRetTy)
594       VecResults = Builder.CreateInsertElement(VecResults, Cloned,
595                                                Builder.getInt32(i));
596   }
597 
598   if (!IsVoidRetTy)
599     WidenMap[Instr] = VecResults;
600 }
601 
602 void
603 SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
604   /*
605    In this function we generate a new loop. The new loop will contain
606    the vectorized instructions while the old loop will continue to run the
607    scalar remainder.
608 
609     [ ] <-- vector loop bypass.
610   /  |
611  /   v
612 |   [ ]     <-- vector pre header.
613 |    |
614 |    v
615 |   [  ] \
616 |   [  ]_|   <-- vector loop.
617 |    |
618  \   v
619    >[ ]   <--- middle-block.
620   /  |
621  /   v
622 |   [ ]     <--- new preheader.
623 |    |
624 |    v
625 |   [ ] \
626 |   [ ]_|   <-- old scalar loop to handle remainder.
627  \   |
628   \  v
629    >[ ]     <-- exit block.
630    ...
631    */
632 
633   // This is the original scalar-loop preheader.
634   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
635   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
636   assert(ExitBlock && "Must have an exit block");
637 
638   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
639   assert(BypassBlock && "Invalid loop structure");
640 
641   BasicBlock *VectorPH =
642       BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
643   BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
644                                                  "vector.body");
645 
646   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
647                                                   "middle.block");
648   BasicBlock *ScalarPH =
649     MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
650                                  "scalar.preheader");
651   // Find the induction variable.
652   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
653   OldInduction = Legal->getInduction();
654   assert(OldInduction && "We must have a single phi node.");
655   Type *IdxTy = OldInduction->getType();
656 
657   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
658   // inside the loop.
659   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
660 
661   // Generate the induction variable.
662   Induction = Builder.CreatePHI(IdxTy, 2, "index");
663   Constant *Zero = ConstantInt::get(IdxTy, 0);
664   Constant *Step = ConstantInt::get(IdxTy, VF);
665 
666   // Find the loop boundaries.
667   const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
668   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
669 
670   // Get the total trip count from the count by adding 1.
671   ExitCount = SE->getAddExpr(ExitCount,
672                              SE->getConstant(ExitCount->getType(), 1));
673 
674   // Expand the trip count and place the new instructions in the preheader.
675   // Notice that the pre-header does not change, only the loop body.
676   SCEVExpander Exp(*SE, "induction");
677   Instruction *Loc = BypassBlock->getTerminator();
678 
679   // We may need to extend the index in case there is a type mismatch.
680   // We know that the count starts at zero and does not overflow.
681   // We are using Zext because it should be less expensive.
682   if (ExitCount->getType() != Induction->getType())
683     ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
684 
685   // Count holds the overall loop count (N).
686   Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
687   // Now we need to generate the expression for N - (N % VF), which is
688   // the part that the vectorized body will execute.
689   Constant *CIVF = ConstantInt::get(IdxTy, VF);
690   Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
691   Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
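  // For example, if N is 10 and VF is 4, then CountRoundDown is 8: the vector
  // body executes the first 8 iterations and the scalar remainder loop
  // handles the remaining 2.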
692 
693   // Now, compare the new count to zero. If it is zero, jump to the scalar part.
694   Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
695                                CountRoundDown, ConstantInt::getNullValue(IdxTy),
696                                "cmp.zero", Loc);
697   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
698   // Remove the old terminator.
699   Loc->eraseFromParent();
700 
701   // Add a check in the middle block to see if we have completed
702   // all of the iterations in the first vector loop.
703   // If (N - N%VF) == N, then we *don't* need to run the remainder.
704   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
705                                 CountRoundDown, "cmp.n",
706                                 MiddleBlock->getTerminator());
707 
708   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
709   // Remove the old terminator.
710   MiddleBlock->getTerminator()->eraseFromParent();
711 
712   // Create i+1 and fill the PHINode.
713   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
714   Induction->addIncoming(Zero, VectorPH);
715   Induction->addIncoming(NextIdx, VecBody);
716   // Create the compare.
717   Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
718   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
719 
720   // Now we have two terminators. Remove the old one from the block.
721   VecBody->getTerminator()->eraseFromParent();
722 
723   // Fix the scalar body iteration count.
724   unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
725   OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
726 
727   // Get ready to start creating new instructions into the vectorized body.
728   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
729 
730   // Register the new loop.
731   Loop* Lp = new Loop();
732   LPM->insertLoop(Lp, OrigLoop->getParentLoop());
733 
734   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
735 
736   Loop *ParentLoop = OrigLoop->getParentLoop();
737   if (ParentLoop) {
738     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
739     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
740     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
741   }
742 
743   // Save the state.
744   LoopVectorPreHeader = VectorPH;
745   LoopScalarPreHeader = ScalarPH;
746   LoopMiddleBlock = MiddleBlock;
747   LoopExitBlock = ExitBlock;
748   LoopVectorBody = VecBody;
749   LoopScalarBody = OldBasicBlock;
750   LoopBypassBlock = BypassBlock;
751 }
752 
753 void
754 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
755   //===------------------------------------------------===//
756   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
759   // the cost-model.
760   //
761   //===------------------------------------------------===//
762   typedef SmallVector<PHINode*, 4> PhiVector;
763   BasicBlock &BB = *OrigLoop->getHeader();
764   Constant *Zero = ConstantInt::get(
765     IntegerType::getInt32Ty(BB.getContext()), 0);
766 
767   // In order to support reduction variables we need to be able to vectorize
768   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
770   // We use this value when we vectorize all of the instructions that use the
771   // PHI. Next, after all of the instructions in the block are complete we
772   // add the new incoming edges to the PHI. At this point all of the
773   // instructions in the basic block are vectorized, so we can use them to
774   // construct the PHI.
775   PhiVector PHIsToFix;
776 
777   // For each instruction in the old loop.
778   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
779     Instruction *Inst = it;
780 
781     switch (Inst->getOpcode()) {
782       case Instruction::Br:
783         // Nothing to do for PHIs and BR, since we already took care of the
784         // loop control flow instructions.
785         continue;
786       case Instruction::PHI:{
787         PHINode* P = cast<PHINode>(Inst);
788         // Special handling for the induction var.
789         if (OldInduction == Inst)
790           continue;
791         // This is phase one of vectorizing PHIs.
792         // This has to be a reduction variable.
793         assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
794         Type *VecTy = VectorType::get(Inst->getType(), VF);
795         WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
796         PHIsToFix.push_back(P);
797         continue;
798       }
799       case Instruction::Add:
800       case Instruction::FAdd:
801       case Instruction::Sub:
802       case Instruction::FSub:
803       case Instruction::Mul:
804       case Instruction::FMul:
805       case Instruction::UDiv:
806       case Instruction::SDiv:
807       case Instruction::FDiv:
808       case Instruction::URem:
809       case Instruction::SRem:
810       case Instruction::FRem:
811       case Instruction::Shl:
812       case Instruction::LShr:
813       case Instruction::AShr:
814       case Instruction::And:
815       case Instruction::Or:
816       case Instruction::Xor: {
817         // Just widen binops.
818         BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
819         Value *A = getVectorValue(Inst->getOperand(0));
820         Value *B = getVectorValue(Inst->getOperand(1));
821         // Use this vector value for all users of the original instruction.
822         WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
823         break;
824       }
825       case Instruction::Select: {
826         // Widen selects.
827         // If the selector is loop invariant we can create a select
828         // instruction with a scalar condition. Otherwise, use vector-select.
829         Value *Cond = Inst->getOperand(0);
830         bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
831 
        // The condition can be loop invariant but still defined inside the
833         // loop. This means that we can't just use the original 'cond' value.
834         // We have to take the 'vectorized' value and pick the first lane.
835         // Instcombine will make this a no-op.
836         Cond = getVectorValue(Cond);
837         if (InvariantCond)
838           Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
839 
840         Value *Op0 = getVectorValue(Inst->getOperand(1));
841         Value *Op1 = getVectorValue(Inst->getOperand(2));
842         WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
843         break;
844       }
845 
846       case Instruction::ICmp:
847       case Instruction::FCmp: {
848         // Widen compares. Generate vector compares.
849         bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
850         CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
851         Value *A = getVectorValue(Inst->getOperand(0));
852         Value *B = getVectorValue(Inst->getOperand(1));
853         if (FCmp)
854           WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
855         else
856           WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
857         break;
858       }
859 
860       case Instruction::Store: {
861         // Attempt to issue a wide store.
862         StoreInst *SI = dyn_cast<StoreInst>(Inst);
863         Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
864         Value *Ptr = SI->getPointerOperand();
865         unsigned Alignment = SI->getAlignment();
866         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
        // If the pointer is not a consecutive GEP, scalarize the store.
868         if (!Legal->isConsecutiveGep(Gep)) {
869           scalarizeInstruction(Inst);
870           break;
871         }
872 
873         // The last index does not have to be the induction. It can be
874         // consecutive and be a function of the index. For example A[I+1];
875         unsigned NumOperands = Gep->getNumOperands();
876         Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
877         LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
878 
879         // Create the new GEP with the new induction variable.
880         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
881         Gep2->setOperand(NumOperands - 1, LastIndex);
882         Ptr = Builder.Insert(Gep2);
883         Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
884         Value *Val = getVectorValue(SI->getValueOperand());
885         Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
886         break;
887       }
888       case Instruction::Load: {
889         // Attempt to issue a wide load.
890         LoadInst *LI = dyn_cast<LoadInst>(Inst);
891         Type *RetTy = VectorType::get(LI->getType(), VF);
892         Value *Ptr = LI->getPointerOperand();
893         unsigned Alignment = LI->getAlignment();
894         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
895 
        // If the pointer is not a consecutive GEP, scalarize the load.
897         if (!Legal->isConsecutiveGep(Gep)) {
898           scalarizeInstruction(Inst);
899           break;
900         }
901 
902         // The last index does not have to be the induction. It can be
903         // consecutive and be a function of the index. For example A[I+1];
904         unsigned NumOperands = Gep->getNumOperands();
905         Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
906         LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
907 
908         // Create the new GEP with the new induction variable.
909         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
910         Gep2->setOperand(NumOperands - 1, LastIndex);
911         Ptr = Builder.Insert(Gep2);
912         Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
913         LI = Builder.CreateLoad(Ptr);
914         LI->setAlignment(Alignment);
915         // Use this vector value for all users of the load.
916         WidenMap[Inst] = LI;
917         break;
918       }
919       case Instruction::ZExt:
920       case Instruction::SExt:
921       case Instruction::FPToUI:
922       case Instruction::FPToSI:
923       case Instruction::FPExt:
924       case Instruction::PtrToInt:
925       case Instruction::IntToPtr:
926       case Instruction::SIToFP:
927       case Instruction::UIToFP:
928       case Instruction::Trunc:
929       case Instruction::FPTrunc:
930       case Instruction::BitCast: {
        // Vectorize casts.
932         CastInst *CI = dyn_cast<CastInst>(Inst);
933         Value *A = getVectorValue(Inst->getOperand(0));
934         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
935         WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
936         break;
937       }
938 
939       default:
        // All other instructions are unsupported. Scalarize them.
941         scalarizeInstruction(Inst);
942         break;
943     }// end of switch.
944   }// end of for_each instr.
945 
  // At this point every instruction in the original loop is widened to
947   // a vector form. We are almost done. Now, we need to fix the PHI nodes
948   // that we vectorized. The PHI nodes are currently empty because we did
949   // not want to introduce cycles. Notice that the remaining PHI nodes
950   // that we need to fix are reduction variables.
951 
  // Create the 'reduced' values for each of the reduction variables.
953   // The reduced values are the vector values that we scalarize and combine
954   // after the loop is finished.
955   for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
956        it != e; ++it) {
957     PHINode *RdxPhi = *it;
958     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    assert(VecRdxPhi && "Unable to recover vectorized PHI");
960 
961     // Find the reduction variable descriptor.
962     assert(Legal->getReductionVars()->count(RdxPhi) &&
963            "Unable to find the reduction variable");
964     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
965       (*Legal->getReductionVars())[RdxPhi];
966 
967     // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
969     // one of the elements with the incoming scalar reduction. We need
970     // to do it in the vector-loop preheader.
971     Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
972 
973     // This is the vector-clone of the value that leaves the loop.
974     Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
975     Type *VecTy = VectorExit->getType();
976 
977     // Find the reduction identity variable. The value of the enum is the
978     // identity. Zero for addition. One for Multiplication.
979     unsigned IdentitySclr =  RdxDesc.Kind;
980     Constant *Identity = getUniformVector(IdentitySclr,
981                                           VecTy->getScalarType());
982 
983     // This vector is the Identity vector where the first element is the
984     // incoming scalar reduction.
985     Value *VectorStart = Builder.CreateInsertElement(Identity,
986                                                     RdxDesc.StartValue, Zero);
987 
988 
989     // Fix the vector-loop phi.
990     // We created the induction variable so we know that the
991     // preheader is the first entry.
992     BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
993 
994     // Reductions do not have to start at zero. They can start with
995     // any loop invariant values.
996     VecRdxPhi->addIncoming(VectorStart, VecPreheader);
997     unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
998     Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
999     VecRdxPhi->addIncoming(Val, LoopVectorBody);
1000 
1001     // Before each round, move the insertion point right between
1002     // the PHIs and the values we are going to write.
1003     // This allows us to write both PHINodes and the extractelement
1004     // instructions.
1005     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
1006 
1007     // This PHINode contains the vectorized reduction variable, or
1008     // the initial value vector, if we bypass the vector loop.
1009     PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
1010     NewPhi->addIncoming(VectorStart, LoopBypassBlock);
1011     NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
1012 
1013     // Extract the first scalar.
1014     Value *Scalar0 =
1015       Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
1016     // Extract and reduce the remaining vector elements.
1017     for (unsigned i=1; i < VF; ++i) {
1018       Value *Scalar1 =
1019         Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
1020       switch (RdxDesc.Kind) {
1021         case LoopVectorizationLegality::IntegerAdd:
1022           Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
1023           break;
1024         case LoopVectorizationLegality::IntegerMult:
1025           Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
1026           break;
1027         case LoopVectorizationLegality::IntegerOr:
1028           Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
1029           break;
1030         case LoopVectorizationLegality::IntegerAnd:
1031           Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
1032           break;
1033         case LoopVectorizationLegality::IntegerXor:
1034           Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
1035           break;
1036         default:
1037           llvm_unreachable("Unknown reduction operation");
1038       }
1039     }
1040 
1041     // Now, we need to fix the users of the reduction variable
1042     // inside and outside of the scalar remainder loop.
1043     // We know that the loop is in LCSSA form. We need to update the
1044     // PHI nodes in the exit blocks.
1045     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
1046          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
1047       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
1048       if (!LCSSAPhi) continue;
1049 
1050       // All PHINodes need to have a single entry edge, or two if
1051       // we already fixed them.
1052       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
1053 
1054       // We found our reduction value exit-PHI. Update it with the
1055       // incoming bypass edge.
1056       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
1057         // Add an edge coming from the bypass.
1058         LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
1059         break;
1060       }
1061     }// end of the LCSSA phi scan.
1062 
1063     // Fix the scalar loop reduction variable with the incoming reduction sum
1064     // from the vector body and from the backedge value.
1065     int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
1066     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
1067     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
1068     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
1069   }// end of for each redux variable.
1070 }
1071 
1072 void SingleBlockLoopVectorizer::updateAnalysis() {
  // Forget the original loop.
1074   SE->forgetLoop(OrigLoop);
1075 
1076   // Update the dominator tree information.
1077   assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
1078          "Entry does not dominate exit.");
1079 
1080   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
1081   DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
1082   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
1083   DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
1084   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
1085   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
1086 
1087   DEBUG(DT->verifyAnalysis());
1088 }
1089 
1090 bool LoopVectorizationLegality::canVectorize() {
1091   if (!TheLoop->getLoopPreheader()) {
1092     assert(false && "No preheader!!");
1093     DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
    return false;
1095   }
1096 
1097   // We can only vectorize single basic block loops.
1098   unsigned NumBlocks = TheLoop->getNumBlocks();
1099   if (NumBlocks != 1) {
1100     DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
1101     return false;
1102   }
1103 
1104   // We need to have a loop header.
1105   BasicBlock *BB = TheLoop->getHeader();
1106   DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
1107 
1108   // Go over each instruction and look at memory deps.
1109   if (!canVectorizeBlock(*BB)) {
1110     DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
1111     return false;
1112   }
1113 
1114   // ScalarEvolution needs to be able to find the exit count.
1115   const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
1116   if (ExitCount == SE->getCouldNotCompute()) {
1117     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
1118     return false;
1119   }
1120 
1121   DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
1122 
1123   // Okay! We can vectorize. At this point we don't have any other mem analysis
1124   // which may limit our maximum vectorization factor, so just return true with
1125   // no restrictions.
1126   return true;
1127 }
1128 
1129 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
1130   // Scan the instructions in the block and look for hazards.
1131   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
1132     Instruction *I = it;
1133 
1134     PHINode *Phi = dyn_cast<PHINode>(I);
1135     if (Phi) {
1136       // This should not happen because the loop should be normalized.
1137       if (Phi->getNumIncomingValues() != 2) {
1138         DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
1139         return false;
1140       }
1141       // We only look at integer phi nodes.
1142       if (!Phi->getType()->isIntegerTy()) {
1143         DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
1144         return false;
1145       }
1146 
1147       if (isInductionVariable(Phi)) {
1148         if (Induction) {
1149           DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
1150           return false;
1151         }
1152         DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
1153         Induction = Phi;
1154         continue;
1155       }
1156       if (AddReductionVar(Phi, IntegerAdd)) {
1157         DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
1158         continue;
1159       }
1160       if (AddReductionVar(Phi, IntegerMult)) {
1161         DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
1162         continue;
1163       }
1164       if (AddReductionVar(Phi, IntegerOr)) {
1165         DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
1166         continue;
1167       }
1168       if (AddReductionVar(Phi, IntegerAnd)) {
1169         DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
1170         continue;
1171       }
1172       if (AddReductionVar(Phi, IntegerXor)) {
1173         DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
1174         continue;
1175       }
1176 
1177       DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
1178       return false;
1179     }// end of PHI handling
1180 
    // We still don't handle function calls.
1182     CallInst *CI = dyn_cast<CallInst>(I);
1183     if (CI) {
1184       DEBUG(dbgs() << "LV: Found a call site.\n");
1185       return false;
1186     }
1187 
1188     // We do not re-vectorize vectors.
1189     if (!VectorType::isValidElementType(I->getType()) &&
1190         !I->getType()->isVoidTy()) {
1191       DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
1192       return false;
1193     }
1194 
1195     // Reduction instructions are allowed to have exit users.
1196     // All other instructions must not have external users.
1197     if (!AllowedExit.count(I))
      // Check that all of the users of the instruction are inside the BB.
1199       for (Value::use_iterator it = I->use_begin(), e = I->use_end();
1200            it != e; ++it) {
1201         Instruction *U = cast<Instruction>(*it);
1202         // This user may be a reduction exit value.
1203         BasicBlock *Parent = U->getParent();
1204         if (Parent != &BB) {
1205           DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
1206           return false;
1207         }
1208     }
1209   } // next instr.
1210 
  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
    return false;
  }
1215 
1216   // Don't vectorize if the memory dependencies do not allow vectorization.
1217   if (!canVectorizeMemory(BB))
1218     return false;
1219 
1220   // We now know that the loop is vectorizable!
1221   // Collect variables that will remain uniform after vectorization.
1222   std::vector<Value*> Worklist;
1223 
1224   // Start with the conditional branch and walk up the block.
1225   Worklist.push_back(BB.getTerminator()->getOperand(0));
1226 
1227   while (Worklist.size()) {
1228     Instruction *I = dyn_cast<Instruction>(Worklist.back());
1229     Worklist.pop_back();
1230     // Look at instructions inside this block.
1231     if (!I) continue;
1232     if (I->getParent() != &BB) continue;
1233 
1234     // Stop when reaching PHI nodes.
1235     if (isa<PHINode>(I)) {
1236       assert(I == Induction && "Found a uniform PHI that is not the induction");
1237       break;
1238     }
1239 
1240     // This is a known uniform.
1241     Uniforms.insert(I);
1242 
1243     // Insert all operands.
1244     for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
1245       Worklist.push_back(I->getOperand(i));
1246     }
1247   }
1248 
1249   return true;
1250 }
1251 
1252 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
1253   typedef SmallVector<Value*, 16> ValueVector;
1254   typedef SmallPtrSet<Value*, 16> ValueSet;
1255   // Holds the Load and Store *instructions*.
1256   ValueVector Loads;
1257   ValueVector Stores;
1258 
1259   // Scan the BB and collect legal loads and stores.
1260   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
1261     Instruction *I = it;
1262 
1263     // If this is a load, save it. If this instruction can read from memory
1264     // but is not a load, then we quit. Notice that we don't handle function
1265     // calls that read or write.
1266     if (I->mayReadFromMemory()) {
1267       LoadInst *Ld = dyn_cast<LoadInst>(I);
1268       if (!Ld) return false;
1269       if (!Ld->isSimple()) {
1270         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
1271         return false;
1272       }
1273       Loads.push_back(Ld);
1274       continue;
1275     }
1276 
1277     // Save store instructions. Abort if other instructions write to memory.
1278     if (I->mayWriteToMemory()) {
1279       StoreInst *St = dyn_cast<StoreInst>(I);
1280       if (!St) return false;
1281       if (!St->isSimple()) {
1282         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
1283         return false;
1284       }
1285       Stores.push_back(St);
1286     }
1287   } // next instr.
1288 
1289   // Now we have two lists that hold the loads and the stores.
1290   // Next, we find the pointers that they use.
1291 
1292   // Check if we see any stores. If there are no stores, then we don't
1293   // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }
1298 
1299   // Holds the read and read-write *pointers* that we find.
1300   ValueVector Reads;
1301   ValueVector ReadWrites;
1302 
1303   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
1304   // multiple times on the same object. If the ptr is accessed twice, once
1305   // for read and once for write, it will only appear once (on the write
1306   // list). This is okay, since we are going to check for conflicts between
1307   // writes and between reads and writes, but not between reads and reads.
1308   ValueSet Seen;
1309 
1310   ValueVector::iterator I, IE;
1311   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
1312     StoreInst *ST = dyn_cast<StoreInst>(*I);
1313     assert(ST && "Bad StoreInst");
1314     Value* Ptr = ST->getPointerOperand();
    // If we did *not* see this pointer before, insert it into
1316     // the read-write list. At this phase it is only a 'write' list.
1317     if (Seen.insert(Ptr))
1318       ReadWrites.push_back(Ptr);
1319   }
1320 
1321   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
1322     LoadInst *LD = dyn_cast<LoadInst>(*I);
1323     assert(LD && "Bad LoadInst");
1324     Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it into the
1326     // read list. If we *did* see it before, then it is already in
1327     // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x, because the address of A[i] is a read-write
1329     // pointer. This only works if the index of A[i] is consecutive.
1330     // If the address of i is unknown (for example A[B[i]]) then we may
1331     // read a few words, modify, and write a few words, and some of the
1332     // words may be written to the same address.
1333     if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
1334       Reads.push_back(Ptr);
1335   }
1336 
1337   // Now that the pointers are in two lists (Reads and ReadWrites), we
1338   // can check that there are no conflicts between the writes, and
1339   // between the writes and the reads.
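  // Illustrative example: in
  //   for (i = 0; i < n; ++i)
  //     A[i] = B[i - 1] + C[i];
  // A's underlying object lands on the read-write list while B's and C's are
  // reads. If B aliases A this is really A[i] = A[i-1] + C[i], a loop-carried
  // dependence: iteration i needs the value iteration i-1 just stored, which
  // a wide load issued ahead of the wide stores would not see. The checks
  // below reject the loop whenever a read's underlying object is also written
  // or cannot be identified.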
1340   ValueSet WriteObjects;
1341   ValueVector TempObjects;
1342 
1343   // Check that the read-writes do not conflict with other read-write
1344   // pointers.
1345   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
1346     GetUnderlyingObjects(*I, TempObjects, DL);
1347     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1348          it != e; ++it) {
1349       if (!isIdentifiedObject(*it)) {
1350         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
1351         return false;
1352       }
1353       if (!WriteObjects.insert(*it)) {
1354         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
1355               << **it <<"\n");
1356         return false;
1357       }
1358     }
1359     TempObjects.clear();
1360   }
1361 
1362   // Check that the reads don't conflict with the read-writes.
1363   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
1364     GetUnderlyingObjects(*I, TempObjects, DL);
1365     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
1366          it != e; ++it) {
1367       if (!isIdentifiedObject(*it)) {
1368         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
1369         return false;
1370       }
1371       if (WriteObjects.count(*it)) {
1372         DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
1373               << **it <<"\n");
1374         return false;
1375       }
1376     }
1377     TempObjects.clear();
1378   }
1379 
1380   // All is okay.
1381   return true;
1382 }
1383 
1384 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
1385                                                 ReductionKind Kind) {
1386   if (Phi->getNumIncomingValues() != 2)
1387     return false;
1388 
1389   // Find the possible incoming reduction variable.
1390   BasicBlock *BB = Phi->getParent();
1391   int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
1392   int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
1393   Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
1394 
1395   // ExitInstruction is the single value which is used outside the loop.
1396   // We only allow a single reduction value to be used outside the loop.
1397   // This includes users of the reduction and the reduction variables
1398   // themselves, which form a cycle that ends in the phi node.
1399   Instruction *ExitInstruction = 0;
1400 
1401   // Iter is our iterator. We start with the PHI node and scan for all of
1402   // the users of this instruction. All users must be instructions that can
1403   // be used as reduction variables (such as ADD). We may have a single
1404   // out-of-block user. The cycle must end with the original PHI.
1405   // Also, we can't have multiple block-local users.
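  // Illustrative shape of such a cycle for the C loop "for (i) Sum += A[i];"
  // (the value names are made up):
  //   %sum.phi  = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]
  //   %val      = load i32* %A.gep
  //   %sum.next = add i32 %sum.phi, %val
  // Here %sum.next is the only in-block user of the phi, it is also the value
  // used after the loop (the ExitInstruction), and the chain ends back at the
  // original phi.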
1406   Instruction *Iter = Phi;
1407   while (true) {
1408     // Any reduction instr must be of one of the allowed kinds.
1409     if (!isReductionInstr(Iter, Kind))
1410       return false;
1411 
1412     // Did we find a user inside this block?
1413     bool FoundInBlockUser = false;
1414     // Did we reach the initial PHI node?
1415     bool FoundStartPHI = false;
1416 
1417     // If the instruction has no users then this is a broken
1418     // chain and can't be a reduction variable.
1419     if (Iter->use_empty())
1420       return false;
1421 
1422     // For each of the *users* of iter.
1423     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
1424          it != e; ++it) {
1425       Instruction *U = cast<Instruction>(*it);
1426       // We already know that the PHI is a user.
1427       if (U == Phi) {
1428         FoundStartPHI = true;
1429         continue;
1430       }
1431       // Check if we found the exit user.
1432       BasicBlock *Parent = U->getParent();
1433       if (Parent != BB) {
1434         // We must have a single exit instruction.
1435         if (ExitInstruction != 0)
1436           return false;
1437         ExitInstruction = Iter;
1438       }
1439       // We can't have multiple inside users.
1440       if (FoundInBlockUser)
1441         return false;
1442       FoundInBlockUser = true;
1443       Iter = U;
1444     }
1445 
1446     // We found a reduction var if we have reached the original
1447     // phi node and we only have a single instruction with out-of-loop
1448     // users.
1449     if (FoundStartPHI && ExitInstruction) {
1450       // This instruction is allowed to have out-of-loop users.
1451       AllowedExit.insert(ExitInstruction);
1452 
1453       // Save the description of this reduction variable.
1454       ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
1455       Reductions[Phi] = RD;
1456       return true;
1457     }
1458   }
1459 }
1460 
1461 bool
1462 LoopVectorizationLegality::isReductionInstr(Instruction *I,
1463                                             ReductionKind Kind) {
1464   switch (I->getOpcode()) {
1465     default:
1466       return false;
1467     case Instruction::PHI:
1468       // A PHI can be part of the reduction cycle.
1469       return true;
1470     case Instruction::Add:
1471     case Instruction::Sub:
1472       return Kind == IntegerAdd;
1473     case Instruction::Mul:
1474     case Instruction::UDiv:
1475     case Instruction::SDiv:
1476       return Kind == IntegerMult;
1477     case Instruction::And:
1478       return Kind == IntegerAnd;
1479     case Instruction::Or:
1480       return Kind == IntegerOr;
1481     case Instruction::Xor:
1482       return Kind == IntegerXor;
1483   }
1484 }
1485 
1486 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
1487   // Check that the PHI is consecutive and starts at zero.
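  // For example, the canonical induction of "for (i = 0; i < n; ++i)" has the
  // SCEV add-recurrence {0,+,1}<loop>, which is accepted; a PHI such as
  // {5,+,2}<loop> (starting at 5, stepping by 2) is rejected below.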
1488   const SCEV *PhiScev = SE->getSCEV(Phi);
1489   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
1490   if (!AR) {
1491     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
1492     return false;
1493   }
1494   const SCEV *Step = AR->getStepRecurrence(*SE);
1495   const SCEV *Start = AR->getStart();
1496 
1497   if (!Step->isOne() || !Start->isZero()) {
1498     DEBUG(dbgs() << "LV: PHI does not start at zero or step by one.\n");
1499     return false;
1500   }
1501   return true;
1502 }
1503 
1504 unsigned
1505 LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
1506   if (!VTTI) {
1507     DEBUG(dbgs() << "LV: No vector target information. Not vectorizing.\n");
1508     return 1;
1509   }
1510 
1511   float Cost = expectedCost(1);
1512   unsigned Width = 1;
1513   DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
1514   for (unsigned i=2; i <= VF; i*=2) {
1515     // Notice that the vector loop needs to be executed fewer times, so
1516     // we need to divide the cost of the vector loop by the width of
1517     // the vector elements.
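    // For example (purely illustrative numbers): if expectedCost(1) == 8 and
    // expectedCost(4) == 20, the per-lane cost of VF=4 is 20/4 == 5 < 8, so
    // VF=4 is preferred unless an even wider factor is cheaper still.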
1518     float VectorCost = expectedCost(i) / (float)i;
1519     DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
1520           (int)VectorCost << ".\n");
1521     if (VectorCost < Cost) {
1522       Cost = VectorCost;
1523       Width = i;
1524     }
1525   }
1526 
1527   DEBUG(dbgs() << "LV: Selecting VF = "<< Width << ".\n");
1528   return Width;
1529 }
1530 
1531 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
1532   // We can only estimate the cost of single basic block loops.
1533   assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
1534 
1535   BasicBlock *BB = TheLoop->getHeader();
1536   unsigned Cost = 0;
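  // Illustrative example: for a header holding { load, add, store, icmp, br }
  // and made-up per-instruction costs of 1, expectedCost(1) would be 5, while
  // expectedCost(4) sums the corresponding wide or scalarized costs instead.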
1537 
1538   // For each instruction in the old loop.
1539   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
1540     Instruction *Inst = it;
1541     unsigned C = getInstructionCost(Inst, VF);
1542     Cost += C;
1543     DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
1544           " For instruction: "<< *Inst << "\n");
1545   }
1546 
1547   return Cost;
1548 }
1549 
1550 unsigned
1551 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
1552   assert(VTTI && "Invalid vector target transformation info");
1553 
1554   // If we know that this instruction will remain uniform, check the cost of
1555   // the scalar version.
1556   if (Legal->isUniformAfterVectorization(I))
1557     VF = 1;
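  // For instance, an address computation that only feeds consecutive loads
  // and stores remains a scalar in the vector loop, so it is charged at
  // VF == 1 to avoid overestimating its cost.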
1558 
1559   Type *RetTy = I->getType();
1560   Type *VectorTy = ToVectorTy(RetTy, VF);
1561 
1562 
1563   // TODO: We need to estimate the cost of intrinsic calls.
1564   switch (I->getOpcode()) {
1565     case Instruction::GetElementPtr:
1566       // We mark this instruction as zero-cost because scalar GEPs are usually
1567       // lowered to the instruction addressing mode. At the moment we don't
1568       // generate vector GEPs.
1569       return 0;
1570     case Instruction::Br: {
1571       return VTTI->getCFInstrCost(I->getOpcode());
1572     }
1573     case Instruction::PHI:
1574       return 0;
1575     case Instruction::Add:
1576     case Instruction::FAdd:
1577     case Instruction::Sub:
1578     case Instruction::FSub:
1579     case Instruction::Mul:
1580     case Instruction::FMul:
1581     case Instruction::UDiv:
1582     case Instruction::SDiv:
1583     case Instruction::FDiv:
1584     case Instruction::URem:
1585     case Instruction::SRem:
1586     case Instruction::FRem:
1587     case Instruction::Shl:
1588     case Instruction::LShr:
1589     case Instruction::AShr:
1590     case Instruction::And:
1591     case Instruction::Or:
1592     case Instruction::Xor: {
1593       return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
1594     }
1595     case Instruction::Select: {
1596       SelectInst *SI = cast<SelectInst>(I);
1597       const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
1598       bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
1599       Type *CondTy = SI->getCondition()->getType();
1600       if (!ScalarCond)
1601         CondTy = VectorType::get(CondTy, VF);
1602 
1603       return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
1604     }
1605     case Instruction::ICmp:
1606     case Instruction::FCmp: {
1607       Type *ValTy = I->getOperand(0)->getType();
1608       VectorTy = ToVectorTy(ValTy, VF);
1609       return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
1610     }
1611     case Instruction::Store: {
1612       StoreInst *SI = cast<StoreInst>(I);
1613       Type *ValTy = SI->getValueOperand()->getType();
1614       VectorTy = ToVectorTy(ValTy, VF);
1615 
1616       if (VF == 1)
1617         return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
1618                               SI->getAlignment(), SI->getPointerAddressSpace());
1619 
1620       // Scalarized stores.
1621       if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
1622         unsigned Cost = 0;
1623         unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
1624                                               ValTy);
1625         // The cost of extracting from the value vector.
1626         Cost += VF * (ExtCost);
1627         // The cost of the scalar stores.
1628         Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
1629                                            ValTy->getScalarType(),
1630                                            SI->getAlignment(),
1631                                            SI->getPointerAddressSpace());
1632         return Cost;
1633       }
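      // For example, with VF == 4 and illustrative VTTI costs of 1 both for
      // an extractelement and for a scalar store, the scalarized form costs
      // 4 * 1 + 4 * 1 == 8, whereas a consecutive store is costed below as a
      // single wide store.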
1634 
1635       // Wide stores.
1636       return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
1637                                    SI->getPointerAddressSpace());
1638     }
1639     case Instruction::Load: {
1640       LoadInst *LI = cast<LoadInst>(I);
1641 
1642       if (VF == 1)
1643         return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
1644                                      LI->getAlignment(),
1645                                      LI->getPointerAddressSpace());
1646 
1647       // Scalarized loads.
1648       if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
1649         unsigned Cost = 0;
1650         unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
1651         // The cost of inserting the loaded value into the result vector.
1652         Cost += VF * (InCost);
1653         // The cost of the scalar loads.
1654         Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
1655                                            RetTy->getScalarType(),
1656                                            LI->getAlignment(),
1657                                            LI->getPointerAddressSpace());
1658         return Cost;
1659       }
1660 
1661       // Wide loads.
1662       return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
1663                                    LI->getPointerAddressSpace());
1664     }
1665     case Instruction::ZExt:
1666     case Instruction::SExt:
1667     case Instruction::FPToUI:
1668     case Instruction::FPToSI:
1669     case Instruction::FPExt:
1670     case Instruction::PtrToInt:
1671     case Instruction::IntToPtr:
1672     case Instruction::SIToFP:
1673     case Instruction::UIToFP:
1674     case Instruction::Trunc:
1675     case Instruction::FPTrunc:
1676     case Instruction::BitCast: {
1677       Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
1678       return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
1679     }
1680     default: {
1681       // We are scalarizing the instruction. Return the cost of the scalar
1682       // instruction, plus the cost of extracting the operands from and
1683       // inserting the result into vector elements, times the vector width.
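      // For example, with VF == 4, an instruction with two operands, and
      // illustrative unit costs for insert, extract and the scalar operation,
      // this comes to 4 * (1 + 1 * 2) + 4 * 1 == 16.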
1684       unsigned Cost = 0;
1685 
1686       bool IsVoid = RetTy->isVoidTy();
1687 
1688       unsigned InsCost = (IsVoid ? 0 :
1689                           VTTI->getInstrCost(Instruction::InsertElement,
1690                                              VectorTy));
1691 
1692       unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
1693                                             VectorTy);
1694 
1695       // The cost of inserting the results plus extracting each one of the
1696       // operands.
1697       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
1698 
1699       // The cost of executing VF copies of the scalar instruction.
1700       Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
1701       return Cost;
1702     }
1703   }// end of switch.
1704 }
1705 
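// Helper that widens a scalar type to a vector of VF elements: for example,
// i32 with VF == 4 becomes <4 x i32>, while void and the VF == 1 case are
// returned unchanged.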
1706 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
1707   if (Scalar->isVoidTy() || VF == 1)
1708     return Scalar;
1709   return VectorType::get(Scalar, VF);
1710 }
1711 
1712 } // namespace
1713 
1714 char LoopVectorize::ID = 0;
1715 static const char lv_name[] = "Loop Vectorization";
1716 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
1717 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
1718 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1719 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1720 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
1721 
1722 namespace llvm {
1723   Pass *createLoopVectorizePass() {
1724     return new LoopVectorize();
1725   }
1726 }
1727 
1728