//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
          cl::desc("Set the default vectorization width. Zero is autoselect."));

namespace {

// Forward declarations.
81 class LoopVectorizationLegality; 82 class LoopVectorizationCostModel; 83 84 /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic 85 /// block to a specified vectorization factor (VF). 86 /// This class performs the widening of scalars into vectors, or multiple 87 /// scalars. This class also implements the following features: 88 /// * It inserts an epilogue loop for handling loops that don't have iteration 89 /// counts that are known to be a multiple of the vectorization factor. 90 /// * It handles the code generation for reduction variables. 91 /// * Scalarization (implementation using scalars) of un-vectorizable 92 /// instructions. 93 /// SingleBlockLoopVectorizer does not perform any vectorization-legality 94 /// checks, and relies on the caller to check for the different legality 95 /// aspects. The SingleBlockLoopVectorizer relies on the 96 /// LoopVectorizationLegality class to provide information about the induction 97 /// and reduction variables that were found to a given vectorization factor. 98 class SingleBlockLoopVectorizer { 99 public: 100 /// Ctor. 101 SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 102 DominatorTree *dt, LPPassManager *Lpm, 103 unsigned VecWidth): 104 OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), 105 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 106 107 // Perform the actual loop widening (vectorization). 108 void vectorize(LoopVectorizationLegality *Legal) { 109 ///Create a new empty loop. Unlink the old loop and connect the new one. 110 createEmptyLoop(Legal); 111 /// Widen each instruction in the old loop to a new one in the new loop. 112 /// Use the Legality module to find the induction and reduction variables. 113 vectorizeLoop(Legal); 114 // register the new loop. 115 updateAnalysis(); 116 } 117 118 private: 119 /// Create an empty loop, based on the loop ranges of the old loop. 120 void createEmptyLoop(LoopVectorizationLegality *Legal); 121 /// Copy and widen the instructions from the old loop. 122 void vectorizeLoop(LoopVectorizationLegality *Legal); 123 /// Insert the new loop to the loop hierarchy and pass manager. 124 void updateAnalysis(); 125 126 /// This instruction is un-vectorizable. Implement it as a sequence 127 /// of scalars. 128 void scalarizeInstruction(Instruction *Instr); 129 130 /// Create a broadcast instruction. This method generates a broadcast 131 /// instruction (shuffle) for loop invariant values and for the induction 132 /// value. If this is the induction variable then we extend it to N, N+1, ... 133 /// this is needed because each iteration in the loop corresponds to a SIMD 134 /// element. 135 Value *getBroadcastInstrs(Value *V); 136 137 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 138 /// for each element in the vector. Starting from zero. 139 Value *getConsecutiveVector(Value* Val); 140 141 /// When we go over instructions in the basic block we rely on previous 142 /// values within the current basic block or on loop invariant values. 143 /// When we widen (vectorize) values we place them in the map. If the values 144 /// are not within the map, they have to be loop invariant, so we simply 145 /// broadcast them into a vector. 146 Value *getVectorValue(Value *V); 147 148 /// Get a uniform vector of constant integers. We use this to get 149 /// vectors of ones and zeros for the reduction code. 
150 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 151 152 typedef DenseMap<Value*, Value*> ValueMap; 153 154 /// The original loop. 155 Loop *OrigLoop; 156 // Scev analysis to use. 157 ScalarEvolution *SE; 158 // Loop Info. 159 LoopInfo *LI; 160 // Dominator Tree. 161 DominatorTree *DT; 162 // Loop Pass Manager; 163 LPPassManager *LPM; 164 // The vectorization factor to use. 165 unsigned VF; 166 167 // The builder that we use 168 IRBuilder<> Builder; 169 170 // --- Vectorization state --- 171 172 /// The vector-loop preheader. 173 BasicBlock *LoopVectorPreHeader; 174 /// The scalar-loop preheader. 175 BasicBlock *LoopScalarPreHeader; 176 /// Middle Block between the vector and the scalar. 177 BasicBlock *LoopMiddleBlock; 178 ///The ExitBlock of the scalar loop. 179 BasicBlock *LoopExitBlock; 180 ///The vector loop body. 181 BasicBlock *LoopVectorBody; 182 ///The scalar loop body. 183 BasicBlock *LoopScalarBody; 184 ///The first bypass block. 185 BasicBlock *LoopBypassBlock; 186 187 /// The new Induction variable which was added to the new block. 188 PHINode *Induction; 189 /// The induction variable of the old basic block. 190 PHINode *OldInduction; 191 // Maps scalars to widened vectors. 192 ValueMap WidenMap; 193 }; 194 195 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 196 /// to what vectorization factor. 197 /// This class does not look at the profitability of vectorization, only the 198 /// legality. This class has two main kinds of checks: 199 /// * Memory checks - The code in canVectorizeMemory checks if vectorization 200 /// will change the order of memory accesses in a way that will change the 201 /// correctness of the program. 202 /// * Scalars checks - The code in canVectorizeBlock checks for a number 203 /// of different conditions, such as the availability of a single induction 204 /// variable, that all types are supported and vectorize-able, etc. 205 /// This code reflects the capabilities of SingleBlockLoopVectorizer. 206 /// This class is also used by SingleBlockLoopVectorizer for identifying 207 /// induction variable and the different reduction variables. 208 class LoopVectorizationLegality { 209 public: 210 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 211 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } 212 213 /// This represents the kinds of reductions that we support. 214 /// We use the enum values to hold the 'identity' value for 215 /// each operand. This value does not change the result if applied. 216 enum ReductionKind { 217 NoReduction = -1, /// Not a reduction. 218 IntegerAdd = 0, /// Sum of numbers. 219 IntegerMult = 1, /// Product of numbers. 220 IntegerOr = 2, /// Bitwise or logical OR of numbers. 221 IntegerAnd = 3, /// Bitwise or logical AND of numbers. 222 IntegerXor = 4 /// Bitwise or logical XOR of numbers. 223 }; 224 225 /// This POD struct holds information about reduction variables. 226 struct ReductionDescriptor { 227 // Default C'tor 228 ReductionDescriptor(): 229 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 230 231 // C'tor. 232 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 233 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 234 235 // The starting value of the reduction. 236 // It does not have to be zero! 237 Value *StartValue; 238 // The instruction who's value is used outside the loop. 239 Instruction *LoopExitInstr; 240 // The kind of the reduction. 
241 ReductionKind Kind; 242 }; 243 244 /// ReductionList contains the reduction descriptors for all 245 /// of the reductions that were found in the loop. 246 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 247 248 /// Returns true if it is legal to vectorize this loop. 249 /// This does not mean that it is profitable to vectorize this 250 /// loop, only that it is legal to do so. 251 bool canVectorize(); 252 253 /// Returns the Induction variable. 254 PHINode *getInduction() {return Induction;} 255 256 /// Returns the reduction variables found in the loop. 257 ReductionList *getReductionVars() { return &Reductions; } 258 259 /// Check if the pointer returned by this GEP is consecutive 260 /// when the index is vectorized. This happens when the last 261 /// index of the GEP is consecutive, like the induction variable. 262 /// This check allows us to vectorize A[idx] into a wide load/store. 263 bool isConsecutiveGep(Value *Ptr); 264 265 /// Returns true if this instruction will remain scalar after vectorization. 266 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 267 268 private: 269 /// Check if a single basic block loop is vectorizable. 270 /// At this point we know that this is a loop with a constant trip count 271 /// and we only need to check individual instructions. 272 bool canVectorizeBlock(BasicBlock &BB); 273 274 /// When we vectorize loops we may change the order in which 275 /// we read and write from memory. This method checks if it is 276 /// legal to vectorize the code, considering only memory constrains. 277 /// Returns true if BB is vectorizable 278 bool canVectorizeMemory(BasicBlock &BB); 279 280 /// Returns True, if 'Phi' is the kind of reduction variable for type 281 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 282 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 283 /// Returns true if the instruction I can be a reduction variable of type 284 /// 'Kind'. 285 bool isReductionInstr(Instruction *I, ReductionKind Kind); 286 /// Returns True, if 'Phi' is an induction variable. 287 bool isInductionVariable(PHINode *Phi); 288 289 /// The loop that we evaluate. 290 Loop *TheLoop; 291 /// Scev analysis. 292 ScalarEvolution *SE; 293 /// DataLayout analysis. 294 DataLayout *DL; 295 296 // --- vectorization state --- // 297 298 /// Holds the induction variable. 299 PHINode *Induction; 300 /// Holds the reduction variables. 301 ReductionList Reductions; 302 /// Allowed outside users. This holds the reduction 303 /// vars which can be accessed from outside the loop. 304 SmallPtrSet<Value*, 4> AllowedExit; 305 /// This set holds the variables which are known to be uniform after 306 /// vectorization. 307 SmallPtrSet<Instruction*, 4> Uniforms; 308 }; 309 310 /// LoopVectorizationCostModel - estimates the expected speedups due to 311 /// vectorization. 312 /// In many cases vectorization is not profitable. This can happen because 313 /// of a number of reasons. In this class we mainly attempt to predict 314 /// the expected speedup/slowdowns due to the supported instruction set. 315 /// We use the VectorTargetTransformInfo to query the different backends 316 /// for the cost of different operations. 317 class LoopVectorizationCostModel { 318 public: 319 /// C'tor. 
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF.
  unsigned findBestVectorizationFactor(unsigned VF = 8);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const VectorTargetTransformInfo *VTTI;
};

struct LoopVectorize : public LoopPass {
  static char ID; // Pass identification, replacement for typeid

  LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Select the preferred vectorization factor.
    unsigned VF = 1;
    if (VectorizationFactor == 0) {
      const VectorTargetTransformInfo *VTTI = 0;
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        return false;
      }

    } else {
      // Use the user command flag.
      VF = VectorizationFactor;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
          "\n");

    // If we decided that it is *legal* to vectorize the loop, then do it.
    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<DominatorTree>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Instructions that access the old induction variable
  // actually want to get the new one.
  if (V == OldInduction)
    V = Induction;
  // Create the types.
  LLVMContext &C = V->getContext();
  Type *VTy = VectorType::get(V->getType(), VF);
  Type *I32 = IntegerType::getInt32Ty(C);
  Constant *Zero = ConstantInt::get(I32, 0);
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                            "broadcast");
  // We are accessing the induction variable. Make sure to promote the
  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
  if (V == Induction)
    return getConsecutiveVector(Shuf);
  return Shuf;
}

Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  unsigned VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (unsigned i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return false;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return false;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
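    // Illustrative note (added commentary): for an access such as A[i], where
    // i is the induction variable, the last GEP index has the SCEV {0,+,1},
    // so Step below is one and the access is treated as consecutive. For a
    // strided access such as A[2*i] the step is two, the check fails, and the
    // callers fall back to scalarizing the load or store.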
501 if (Step->isOne()) 502 return true; 503 } 504 505 return false; 506 } 507 508 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 509 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 510 // If we saved a vectorized copy of V, use it. 511 Value *&MapEntry = WidenMap[V]; 512 if (MapEntry) 513 return MapEntry; 514 515 // Broadcast V and save the value for future uses. 516 Value *B = getBroadcastInstrs(V); 517 MapEntry = B; 518 return B; 519 } 520 521 Constant* 522 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 523 SmallVector<Constant*, 8> Indices; 524 // Create a vector of consecutive numbers from zero to VF. 525 for (unsigned i = 0; i < VF; ++i) 526 Indices.push_back(ConstantInt::get(ScalarTy, Val)); 527 528 // Add the consecutive indices to the vector value. 529 return ConstantVector::get(Indices); 530 } 531 532 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 533 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 534 // Holds vector parameters or scalars, in case of uniform vals. 535 SmallVector<Value*, 8> Params; 536 537 // Find all of the vectorized parameters. 538 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 539 Value *SrcOp = Instr->getOperand(op); 540 541 // If we are accessing the old induction variable, use the new one. 542 if (SrcOp == OldInduction) { 543 Params.push_back(getBroadcastInstrs(Induction)); 544 continue; 545 } 546 547 // Try using previously calculated values. 548 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 549 550 // If the src is an instruction that appeared earlier in the basic block 551 // then it should already be vectorized. 552 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 553 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 554 // The parameter is a vector value from earlier. 555 Params.push_back(WidenMap[SrcInst]); 556 } else { 557 // The parameter is a scalar from outside the loop. Maybe even a constant. 558 Params.push_back(SrcOp); 559 } 560 } 561 562 assert(Params.size() == Instr->getNumOperands() && 563 "Invalid number of operands"); 564 565 // Does this instruction return a value ? 566 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 567 Value *VecResults = 0; 568 569 // If we have a return value, create an empty vector. We place the scalarized 570 // instructions in this vector. 571 if (!IsVoidRetTy) 572 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 573 574 // For each scalar that we create: 575 for (unsigned i = 0; i < VF; ++i) { 576 Instruction *Cloned = Instr->clone(); 577 if (!IsVoidRetTy) 578 Cloned->setName(Instr->getName() + ".cloned"); 579 // Replace the operands of the cloned instrucions with extracted scalars. 580 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 581 Value *Op = Params[op]; 582 // Param is a vector. Need to extract the right lane. 583 if (Op->getType()->isVectorTy()) 584 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 585 Cloned->setOperand(op, Op); 586 } 587 588 // Place the cloned scalar in the new loop. 589 Builder.Insert(Cloned); 590 591 // If the original scalar returns a value we need to place it in a vector 592 // so that future users will be able to use it. 
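    // Added illustration: with VF = 4, lane i of VecResults receives the i-th
    // scalar clone, so after the last iteration of this loop VecResults holds
    // all four scalarized results and is recorded in WidenMap below.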
593 if (!IsVoidRetTy) 594 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 595 Builder.getInt32(i)); 596 } 597 598 if (!IsVoidRetTy) 599 WidenMap[Instr] = VecResults; 600 } 601 602 void 603 SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 604 /* 605 In this function we generate a new loop. The new loop will contain 606 the vectorized instructions while the old loop will continue to run the 607 scalar remainder. 608 609 [ ] <-- vector loop bypass. 610 / | 611 / v 612 | [ ] <-- vector pre header. 613 | | 614 | v 615 | [ ] \ 616 | [ ]_| <-- vector loop. 617 | | 618 \ v 619 >[ ] <--- middle-block. 620 / | 621 / v 622 | [ ] <--- new preheader. 623 | | 624 | v 625 | [ ] \ 626 | [ ]_| <-- old scalar loop to handle remainder. 627 \ | 628 \ v 629 >[ ] <-- exit block. 630 ... 631 */ 632 633 // This is the original scalar-loop preheader. 634 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 635 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 636 assert(ExitBlock && "Must have an exit block"); 637 638 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 639 assert(BypassBlock && "Invalid loop structure"); 640 641 BasicBlock *VectorPH = 642 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 643 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 644 "vector.body"); 645 646 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 647 "middle.block"); 648 BasicBlock *ScalarPH = 649 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 650 "scalar.preheader"); 651 // Find the induction variable. 652 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 653 OldInduction = Legal->getInduction(); 654 assert(OldInduction && "We must have a single phi node."); 655 Type *IdxTy = OldInduction->getType(); 656 657 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 658 // inside the loop. 659 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 660 661 // Generate the induction variable. 662 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 663 Constant *Zero = ConstantInt::get(IdxTy, 0); 664 Constant *Step = ConstantInt::get(IdxTy, VF); 665 666 // Find the loop boundaries. 667 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 668 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 669 670 // Get the total trip count from the count by adding 1. 671 ExitCount = SE->getAddExpr(ExitCount, 672 SE->getConstant(ExitCount->getType(), 1)); 673 674 // Expand the trip count and place the new instructions in the preheader. 675 // Notice that the pre-header does not change, only the loop body. 676 SCEVExpander Exp(*SE, "induction"); 677 Instruction *Loc = BypassBlock->getTerminator(); 678 679 // We may need to extend the index in case there is a type mismatch. 680 // We know that the count starts at zero and does not overflow. 681 // We are using Zext because it should be less expensive. 682 if (ExitCount->getType() != Induction->getType()) 683 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 684 685 // Count holds the overall loop count (N). 686 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 687 // Now we need to generate the expression for N - (N % VF), which is 688 // the part that the vectorized body will execute. 
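  // Worked example (added commentary): if the trip count N is 137 and VF is
  // 4, then n.mod.vf is 1 and n.vec (CountRoundDown) is 136, so the vector
  // body runs 136/4 = 34 iterations and the scalar loop handles the final
  // iteration.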
689 Constant *CIVF = ConstantInt::get(IdxTy, VF); 690 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 691 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 692 693 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 694 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 695 CountRoundDown, ConstantInt::getNullValue(IdxTy), 696 "cmp.zero", Loc); 697 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 698 // Remove the old terminator. 699 Loc->eraseFromParent(); 700 701 // Add a check in the middle block to see if we have completed 702 // all of the iterations in the first vector loop. 703 // If (N - N%VF) == N, then we *don't* need to run the remainder. 704 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 705 CountRoundDown, "cmp.n", 706 MiddleBlock->getTerminator()); 707 708 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 709 // Remove the old terminator. 710 MiddleBlock->getTerminator()->eraseFromParent(); 711 712 // Create i+1 and fill the PHINode. 713 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 714 Induction->addIncoming(Zero, VectorPH); 715 Induction->addIncoming(NextIdx, VecBody); 716 // Create the compare. 717 Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); 718 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 719 720 // Now we have two terminators. Remove the old one from the block. 721 VecBody->getTerminator()->eraseFromParent(); 722 723 // Fix the scalar body iteration count. 724 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); 725 OldInduction->setIncomingValue(BlockIdx, CountRoundDown); 726 727 // Get ready to start creating new instructions into the vectorized body. 728 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 729 730 // Register the new loop. 731 Loop* Lp = new Loop(); 732 LPM->insertLoop(Lp, OrigLoop->getParentLoop()); 733 734 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 735 736 Loop *ParentLoop = OrigLoop->getParentLoop(); 737 if (ParentLoop) { 738 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 739 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 740 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 741 } 742 743 // Save the state. 744 LoopVectorPreHeader = VectorPH; 745 LoopScalarPreHeader = ScalarPH; 746 LoopMiddleBlock = MiddleBlock; 747 LoopExitBlock = ExitBlock; 748 LoopVectorBody = VecBody; 749 LoopScalarBody = OldBasicBlock; 750 LoopBypassBlock = BypassBlock; 751 } 752 753 void 754 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 755 //===------------------------------------------------===// 756 // 757 // Notice: any optimization or new instruction that go 758 // into the code below should be also be implemented in 759 // the cost-model. 760 // 761 //===------------------------------------------------===// 762 typedef SmallVector<PHINode*, 4> PhiVector; 763 BasicBlock &BB = *OrigLoop->getHeader(); 764 Constant *Zero = ConstantInt::get( 765 IntegerType::getInt32Ty(BB.getContext()), 0); 766 767 // In order to support reduction variables we need to be able to vectorize 768 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 769 // steages. First, we create a new vector PHI node with no incoming edges. 770 // We use this value when we vectorize all of the instructions that use the 771 // PHI. 
Next, after all of the instructions in the block are complete we 772 // add the new incoming edges to the PHI. At this point all of the 773 // instructions in the basic block are vectorized, so we can use them to 774 // construct the PHI. 775 PhiVector PHIsToFix; 776 777 // For each instruction in the old loop. 778 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 779 Instruction *Inst = it; 780 781 switch (Inst->getOpcode()) { 782 case Instruction::Br: 783 // Nothing to do for PHIs and BR, since we already took care of the 784 // loop control flow instructions. 785 continue; 786 case Instruction::PHI:{ 787 PHINode* P = cast<PHINode>(Inst); 788 // Special handling for the induction var. 789 if (OldInduction == Inst) 790 continue; 791 // This is phase one of vectorizing PHIs. 792 // This has to be a reduction variable. 793 assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); 794 Type *VecTy = VectorType::get(Inst->getType(), VF); 795 WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); 796 PHIsToFix.push_back(P); 797 continue; 798 } 799 case Instruction::Add: 800 case Instruction::FAdd: 801 case Instruction::Sub: 802 case Instruction::FSub: 803 case Instruction::Mul: 804 case Instruction::FMul: 805 case Instruction::UDiv: 806 case Instruction::SDiv: 807 case Instruction::FDiv: 808 case Instruction::URem: 809 case Instruction::SRem: 810 case Instruction::FRem: 811 case Instruction::Shl: 812 case Instruction::LShr: 813 case Instruction::AShr: 814 case Instruction::And: 815 case Instruction::Or: 816 case Instruction::Xor: { 817 // Just widen binops. 818 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 819 Value *A = getVectorValue(Inst->getOperand(0)); 820 Value *B = getVectorValue(Inst->getOperand(1)); 821 // Use this vector value for all users of the original instruction. 822 WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 823 break; 824 } 825 case Instruction::Select: { 826 // Widen selects. 827 // If the selector is loop invariant we can create a select 828 // instruction with a scalar condition. Otherwise, use vector-select. 829 Value *Cond = Inst->getOperand(0); 830 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 831 832 // The condition can be loop invariant but still defined inside the 833 // loop. This means that we can't just use the original 'cond' value. 834 // We have to take the 'vectorized' value and pick the first lane. 835 // Instcombine will make this a no-op. 836 Cond = getVectorValue(Cond); 837 if (InvariantCond) 838 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 839 840 Value *Op0 = getVectorValue(Inst->getOperand(1)); 841 Value *Op1 = getVectorValue(Inst->getOperand(2)); 842 WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); 843 break; 844 } 845 846 case Instruction::ICmp: 847 case Instruction::FCmp: { 848 // Widen compares. Generate vector compares. 849 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 850 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 851 Value *A = getVectorValue(Inst->getOperand(0)); 852 Value *B = getVectorValue(Inst->getOperand(1)); 853 if (FCmp) 854 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 855 else 856 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 857 break; 858 } 859 860 case Instruction::Store: { 861 // Attempt to issue a wide store. 
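      // Added note: the wide store below is emitted only when the pointer is
      // a GEP whose last index advances consecutively with the induction
      // variable (see isConsecutiveGep); any other store is scalarized.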
862 StoreInst *SI = dyn_cast<StoreInst>(Inst); 863 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 864 Value *Ptr = SI->getPointerOperand(); 865 unsigned Alignment = SI->getAlignment(); 866 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 867 // This store does not use GEPs. 868 if (!Legal->isConsecutiveGep(Gep)) { 869 scalarizeInstruction(Inst); 870 break; 871 } 872 873 // The last index does not have to be the induction. It can be 874 // consecutive and be a function of the index. For example A[I+1]; 875 unsigned NumOperands = Gep->getNumOperands(); 876 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 877 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 878 879 // Create the new GEP with the new induction variable. 880 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 881 Gep2->setOperand(NumOperands - 1, LastIndex); 882 Ptr = Builder.Insert(Gep2); 883 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 884 Value *Val = getVectorValue(SI->getValueOperand()); 885 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 886 break; 887 } 888 case Instruction::Load: { 889 // Attempt to issue a wide load. 890 LoadInst *LI = dyn_cast<LoadInst>(Inst); 891 Type *RetTy = VectorType::get(LI->getType(), VF); 892 Value *Ptr = LI->getPointerOperand(); 893 unsigned Alignment = LI->getAlignment(); 894 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 895 896 // We don't have a gep. Scalarize the load. 897 if (!Legal->isConsecutiveGep(Gep)) { 898 scalarizeInstruction(Inst); 899 break; 900 } 901 902 // The last index does not have to be the induction. It can be 903 // consecutive and be a function of the index. For example A[I+1]; 904 unsigned NumOperands = Gep->getNumOperands(); 905 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 906 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 907 908 // Create the new GEP with the new induction variable. 909 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 910 Gep2->setOperand(NumOperands - 1, LastIndex); 911 Ptr = Builder.Insert(Gep2); 912 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 913 LI = Builder.CreateLoad(Ptr); 914 LI->setAlignment(Alignment); 915 // Use this vector value for all users of the load. 916 WidenMap[Inst] = LI; 917 break; 918 } 919 case Instruction::ZExt: 920 case Instruction::SExt: 921 case Instruction::FPToUI: 922 case Instruction::FPToSI: 923 case Instruction::FPExt: 924 case Instruction::PtrToInt: 925 case Instruction::IntToPtr: 926 case Instruction::SIToFP: 927 case Instruction::UIToFP: 928 case Instruction::Trunc: 929 case Instruction::FPTrunc: 930 case Instruction::BitCast: { 931 /// Vectorize bitcasts. 932 CastInst *CI = dyn_cast<CastInst>(Inst); 933 Value *A = getVectorValue(Inst->getOperand(0)); 934 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 935 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 936 break; 937 } 938 939 default: 940 /// All other instructions are unsupported. Scalarize them. 941 scalarizeInstruction(Inst); 942 break; 943 }// end of switch. 944 }// end of for_each instr. 945 946 // At this point every instruction in the original loop is widended to 947 // a vector form. We are almost done. Now, we need to fix the PHI nodes 948 // that we vectorized. The PHI nodes are currently empty because we did 949 // not want to introduce cycles. Notice that the remaining PHI nodes 950 // that we need to fix are reduction variables. 
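  // Illustrative sketch (added commentary): for a VF=4 integer-add reduction
  //   sum = phi [ start, preheader ], [ sum.next, body ]
  // the vector loop keeps a <4 x i32> partial-sum phi whose preheader value
  // is <start, 0, 0, 0> (the add identity with the start value in lane 0).
  // After the loop the four lanes are extracted and added together, and the
  // resulting scalar is wired into the scalar remainder loop and the LCSSA
  // exit phi.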

  // Create the 'reduced' values for each of the reduction variables.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. The value of the enum is the
    // identity. Zero for addition, one for multiplication.
    unsigned IdentitySclr = RdxDesc.Kind;
    Constant *Identity = getUniformVector(IdentitySclr,
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements.
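    // For example (added commentary), with VF = 4 and an add reduction the
    // loop below emits:
    //   Scalar0 = ((lane0 + lane1) + lane2) + lane3
    // using one extractelement and one binary op per remaining lane.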
1017 for (unsigned i=1; i < VF; ++i) { 1018 Value *Scalar1 = 1019 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); 1020 switch (RdxDesc.Kind) { 1021 case LoopVectorizationLegality::IntegerAdd: 1022 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); 1023 break; 1024 case LoopVectorizationLegality::IntegerMult: 1025 Scalar0 = Builder.CreateMul(Scalar0, Scalar1); 1026 break; 1027 case LoopVectorizationLegality::IntegerOr: 1028 Scalar0 = Builder.CreateOr(Scalar0, Scalar1); 1029 break; 1030 case LoopVectorizationLegality::IntegerAnd: 1031 Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); 1032 break; 1033 case LoopVectorizationLegality::IntegerXor: 1034 Scalar0 = Builder.CreateXor(Scalar0, Scalar1); 1035 break; 1036 default: 1037 llvm_unreachable("Unknown reduction operation"); 1038 } 1039 } 1040 1041 // Now, we need to fix the users of the reduction variable 1042 // inside and outside of the scalar remainder loop. 1043 // We know that the loop is in LCSSA form. We need to update the 1044 // PHI nodes in the exit blocks. 1045 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1046 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1047 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1048 if (!LCSSAPhi) continue; 1049 1050 // All PHINodes need to have a single entry edge, or two if 1051 // we already fixed them. 1052 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1053 1054 // We found our reduction value exit-PHI. Update it with the 1055 // incoming bypass edge. 1056 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1057 // Add an edge coming from the bypass. 1058 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1059 break; 1060 } 1061 }// end of the LCSSA phi scan. 1062 1063 // Fix the scalar loop reduction variable with the incoming reduction sum 1064 // from the vector body and from the backedge value. 1065 int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1066 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. 1067 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1068 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1069 }// end of for each redux variable. 1070 } 1071 1072 void SingleBlockLoopVectorizer::updateAnalysis() { 1073 // The original basic block. 1074 SE->forgetLoop(OrigLoop); 1075 1076 // Update the dominator tree information. 1077 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1078 "Entry does not dominate exit."); 1079 1080 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1081 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1082 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1083 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1084 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1085 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1086 1087 DEBUG(DT->verifyAnalysis()); 1088 } 1089 1090 bool LoopVectorizationLegality::canVectorize() { 1091 if (!TheLoop->getLoopPreheader()) { 1092 assert(false && "No preheader!!"); 1093 DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); 1094 return false; 1095 } 1096 1097 // We can only vectorize single basic block loops. 1098 unsigned NumBlocks = TheLoop->getNumBlocks(); 1099 if (NumBlocks != 1) { 1100 DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); 1101 return false; 1102 } 1103 1104 // We need to have a loop header. 
1105 BasicBlock *BB = TheLoop->getHeader(); 1106 DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); 1107 1108 // Go over each instruction and look at memory deps. 1109 if (!canVectorizeBlock(*BB)) { 1110 DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); 1111 return false; 1112 } 1113 1114 // ScalarEvolution needs to be able to find the exit count. 1115 const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); 1116 if (ExitCount == SE->getCouldNotCompute()) { 1117 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1118 return false; 1119 } 1120 1121 DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); 1122 1123 // Okay! We can vectorize. At this point we don't have any other mem analysis 1124 // which may limit our maximum vectorization factor, so just return true with 1125 // no restrictions. 1126 return true; 1127 } 1128 1129 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { 1130 // Scan the instructions in the block and look for hazards. 1131 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1132 Instruction *I = it; 1133 1134 PHINode *Phi = dyn_cast<PHINode>(I); 1135 if (Phi) { 1136 // This should not happen because the loop should be normalized. 1137 if (Phi->getNumIncomingValues() != 2) { 1138 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1139 return false; 1140 } 1141 // We only look at integer phi nodes. 1142 if (!Phi->getType()->isIntegerTy()) { 1143 DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); 1144 return false; 1145 } 1146 1147 if (isInductionVariable(Phi)) { 1148 if (Induction) { 1149 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1150 return false; 1151 } 1152 DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); 1153 Induction = Phi; 1154 continue; 1155 } 1156 if (AddReductionVar(Phi, IntegerAdd)) { 1157 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1158 continue; 1159 } 1160 if (AddReductionVar(Phi, IntegerMult)) { 1161 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1162 continue; 1163 } 1164 if (AddReductionVar(Phi, IntegerOr)) { 1165 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1166 continue; 1167 } 1168 if (AddReductionVar(Phi, IntegerAnd)) { 1169 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1170 continue; 1171 } 1172 if (AddReductionVar(Phi, IntegerXor)) { 1173 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1174 continue; 1175 } 1176 1177 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1178 return false; 1179 }// end of PHI handling 1180 1181 // We still don't handle functions. 1182 CallInst *CI = dyn_cast<CallInst>(I); 1183 if (CI) { 1184 DEBUG(dbgs() << "LV: Found a call site.\n"); 1185 return false; 1186 } 1187 1188 // We do not re-vectorize vectors. 1189 if (!VectorType::isValidElementType(I->getType()) && 1190 !I->getType()->isVoidTy()) { 1191 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1192 return false; 1193 } 1194 1195 // Reduction instructions are allowed to have exit users. 1196 // All other instructions must not have external users. 1197 if (!AllowedExit.count(I)) 1198 //Check that all of the users of the loop are inside the BB. 1199 for (Value::use_iterator it = I->use_begin(), e = I->use_end(); 1200 it != e; ++it) { 1201 Instruction *U = cast<Instruction>(*it); 1202 // This user may be a reduction exit value. 
1203 BasicBlock *Parent = U->getParent(); 1204 if (Parent != &BB) { 1205 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1206 return false; 1207 } 1208 } 1209 } // next instr. 1210 1211 if (!Induction) { 1212 DEBUG(dbgs() << "LV: Did not find an induction var.\n"); 1213 return false; 1214 } 1215 1216 // Don't vectorize if the memory dependencies do not allow vectorization. 1217 if (!canVectorizeMemory(BB)) 1218 return false; 1219 1220 // We now know that the loop is vectorizable! 1221 // Collect variables that will remain uniform after vectorization. 1222 std::vector<Value*> Worklist; 1223 1224 // Start with the conditional branch and walk up the block. 1225 Worklist.push_back(BB.getTerminator()->getOperand(0)); 1226 1227 while (Worklist.size()) { 1228 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1229 Worklist.pop_back(); 1230 // Look at instructions inside this block. 1231 if (!I) continue; 1232 if (I->getParent() != &BB) continue; 1233 1234 // Stop when reaching PHI nodes. 1235 if (isa<PHINode>(I)) { 1236 assert(I == Induction && "Found a uniform PHI that is not the induction"); 1237 break; 1238 } 1239 1240 // This is a known uniform. 1241 Uniforms.insert(I); 1242 1243 // Insert all operands. 1244 for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { 1245 Worklist.push_back(I->getOperand(i)); 1246 } 1247 } 1248 1249 return true; 1250 } 1251 1252 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { 1253 typedef SmallVector<Value*, 16> ValueVector; 1254 typedef SmallPtrSet<Value*, 16> ValueSet; 1255 // Holds the Load and Store *instructions*. 1256 ValueVector Loads; 1257 ValueVector Stores; 1258 1259 // Scan the BB and collect legal loads and stores. 1260 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1261 Instruction *I = it; 1262 1263 // If this is a load, save it. If this instruction can read from memory 1264 // but is not a load, then we quit. Notice that we don't handle function 1265 // calls that read or write. 1266 if (I->mayReadFromMemory()) { 1267 LoadInst *Ld = dyn_cast<LoadInst>(I); 1268 if (!Ld) return false; 1269 if (!Ld->isSimple()) { 1270 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1271 return false; 1272 } 1273 Loads.push_back(Ld); 1274 continue; 1275 } 1276 1277 // Save store instructions. Abort if other instructions write to memory. 1278 if (I->mayWriteToMemory()) { 1279 StoreInst *St = dyn_cast<StoreInst>(I); 1280 if (!St) return false; 1281 if (!St->isSimple()) { 1282 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1283 return false; 1284 } 1285 Stores.push_back(St); 1286 } 1287 } // next instr. 1288 1289 // Now we have two lists that hold the loads and the stores. 1290 // Next, we find the pointers that they use. 1291 1292 // Check if we see any stores. If there are no stores, then we don't 1293 // care if the pointers are *restrict*. 1294 if (!Stores.size()) { 1295 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1296 return true; 1297 } 1298 1299 // Holds the read and read-write *pointers* that we find. 1300 ValueVector Reads; 1301 ValueVector ReadWrites; 1302 1303 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1304 // multiple times on the same object. If the ptr is accessed twice, once 1305 // for read and once for write, it will only appear once (on the write 1306 // list). This is okay, since we are going to check for conflicts between 1307 // writes and between reads and writes, but not between reads and reads. 
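  // Added note: the conflict checks below are conservative. They operate on
  // the underlying objects of the pointers (e.g. an alloca or a global), so
  // two writes into the same array through different GEPs are rejected as a
  // possible write-write reorder even if their indices never overlap.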
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();
    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x, because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
      Reads.push_back(Ptr);
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between the writes, and no
  // conflicts between the writes and the reads.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return false;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  // Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return false;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  // All is okay.
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction variables, which form a cycle
  // that ends in the phi node.
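  // Example of the pattern being matched (added commentary):
  //   %sum = phi i32 [ %start, %preheader ], [ %sum.next, %body ]
  //   %sum.next = add i32 %sum, %val
  // Here %sum.next is the only instruction that may have users outside the
  // loop; it is recorded as the ExitInstruction below.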
1399 Instruction *ExitInstruction = 0; 1400 1401 // Iter is our iterator. We start with the PHI node and scan for all of the 1402 // users of this instruction. All users must be instructions which can be 1403 // used as reduction variables (such as ADD). We may have a single 1404 // out-of-block user. They cycle must end with the original PHI. 1405 // Also, we can't have multiple block-local users. 1406 Instruction *Iter = Phi; 1407 while (true) { 1408 // Any reduction instr must be of one of the allowed kinds. 1409 if (!isReductionInstr(Iter, Kind)) 1410 return false; 1411 1412 // Did we found a user inside this block ? 1413 bool FoundInBlockUser = false; 1414 // Did we reach the initial PHI node ? 1415 bool FoundStartPHI = false; 1416 1417 // If the instruction has no users then this is a broken 1418 // chain and can't be a reduction variable. 1419 if (Iter->use_empty()) 1420 return false; 1421 1422 // For each of the *users* of iter. 1423 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1424 it != e; ++it) { 1425 Instruction *U = cast<Instruction>(*it); 1426 // We already know that the PHI is a user. 1427 if (U == Phi) { 1428 FoundStartPHI = true; 1429 continue; 1430 } 1431 // Check if we found the exit user. 1432 BasicBlock *Parent = U->getParent(); 1433 if (Parent != BB) { 1434 // We must have a single exit instruction. 1435 if (ExitInstruction != 0) 1436 return false; 1437 ExitInstruction = Iter; 1438 } 1439 // We can't have multiple inside users. 1440 if (FoundInBlockUser) 1441 return false; 1442 FoundInBlockUser = true; 1443 Iter = U; 1444 } 1445 1446 // We found a reduction var if we have reached the original 1447 // phi node and we only have a single instruction with out-of-loop 1448 // users. 1449 if (FoundStartPHI && ExitInstruction) { 1450 // This instruction is allowed to have out-of-loop users. 1451 AllowedExit.insert(ExitInstruction); 1452 1453 // Save the description of this reduction variable. 1454 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1455 Reductions[Phi] = RD; 1456 return true; 1457 } 1458 } 1459 } 1460 1461 bool 1462 LoopVectorizationLegality::isReductionInstr(Instruction *I, 1463 ReductionKind Kind) { 1464 switch (I->getOpcode()) { 1465 default: 1466 return false; 1467 case Instruction::PHI: 1468 // possibly. 1469 return true; 1470 case Instruction::Add: 1471 case Instruction::Sub: 1472 return Kind == IntegerAdd; 1473 case Instruction::Mul: 1474 case Instruction::UDiv: 1475 case Instruction::SDiv: 1476 return Kind == IntegerMult; 1477 case Instruction::And: 1478 return Kind == IntegerAnd; 1479 case Instruction::Or: 1480 return Kind == IntegerOr; 1481 case Instruction::Xor: 1482 return Kind == IntegerXor; 1483 } 1484 } 1485 1486 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 1487 // Check that the PHI is consecutive and starts at zero. 1488 const SCEV *PhiScev = SE->getSCEV(Phi); 1489 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1490 if (!AR) { 1491 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 1492 return false; 1493 } 1494 const SCEV *Step = AR->getStepRecurrence(*SE); 1495 const SCEV *Start = AR->getStart(); 1496 1497 if (!Step->isOne() || !Start->isZero()) { 1498 DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); 1499 return false; 1500 } 1501 return true; 1502 } 1503 1504 unsigned 1505 LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { 1506 if (!VTTI) { 1507 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. 
\n"); 1508 return 1; 1509 } 1510 1511 float Cost = expectedCost(1); 1512 unsigned Width = 1; 1513 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 1514 for (unsigned i=2; i <= VF; i*=2) { 1515 // Notice that the vector loop needs to be executed less times, so 1516 // we need to divide the cost of the vector loops by the width of 1517 // the vector elements. 1518 float VectorCost = expectedCost(i) / (float)i; 1519 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 1520 (int)VectorCost << ".\n"); 1521 if (VectorCost < Cost) { 1522 Cost = VectorCost; 1523 Width = i; 1524 } 1525 } 1526 1527 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 1528 return Width; 1529 } 1530 1531 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 1532 // We can only estimate the cost of single basic block loops. 1533 assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); 1534 1535 BasicBlock *BB = TheLoop->getHeader(); 1536 unsigned Cost = 0; 1537 1538 // For each instruction in the old loop. 1539 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1540 Instruction *Inst = it; 1541 unsigned C = getInstructionCost(Inst, VF); 1542 Cost += C; 1543 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << 1544 " For instruction: "<< *Inst << "\n"); 1545 } 1546 1547 return Cost; 1548 } 1549 1550 unsigned 1551 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 1552 assert(VTTI && "Invalid vector target transformation info"); 1553 1554 // If we know that this instruction will remain uniform, check the cost of 1555 // the scalar version. 1556 if (Legal->isUniformAfterVectorization(I)) 1557 VF = 1; 1558 1559 Type *RetTy = I->getType(); 1560 Type *VectorTy = ToVectorTy(RetTy, VF); 1561 1562 1563 // TODO: We need to estimate the cost of intrinsic calls. 1564 switch (I->getOpcode()) { 1565 case Instruction::GetElementPtr: 1566 // We mark this instruction as zero-cost because scalar GEPs are usually 1567 // lowered to the intruction addressing mode. At the moment we don't 1568 // generate vector geps. 
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    VectorTy = ToVectorTy(ValTy, VF);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                   SI->getAlignment(),
                                   SI->getPointerAddressSpace());

    // Scalarized stores.
    if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            ValTy);
      // The cost of extracting from the value vector.
      Cost += VF * (ExtCost);
      // The cost of the scalar stores.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         ValTy->getScalarType(),
                                         SI->getAlignment(),
                                         SI->getPointerAddressSpace());
      return Cost;
    }

    // Wide stores.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                   LI->getAlignment(),
                                   LI->getPointerAddressSpace());

    // Scalarized loads.
    if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
      // The cost of inserting the loaded value into the result vector.
      Cost += VF * (InCost);
      // The cost of the scalar loads.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         RetTy->getScalarType(),
                                         LI->getAlignment(),
                                         LI->getPointerAddressSpace());
      return Cost;
    }
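
    // Worked example (hypothetical target costs, added commentary): with
    // VF == 4, if an insertelement costs 1 and a scalar load costs 2, the
    // scalarized form above is modeled as 4 * 1 + 4 * 2 = 12, whereas a
    // consecutive pointer falls through to a single wide-load query below.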

    // Wide loads.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    bool IsVoid = RetTy->isVoidTy();

    unsigned InsCost = (IsVoid ? 0 :
                        VTTI->getInstrCost(Instruction::InsertElement,
                                           VectorTy));

    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                          VectorTy);

    // The cost of inserting the results plus extracting each one of the
    // operands.
    Cost += VF * (InsCost + ExtCost * I->getNumOperands());

    // The cost of executing VF copies of the scalar instruction.
    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
    return Cost;
  }
  } // end of switch.
}

Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

} // namespace

char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}
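
// Usage sketch (added commentary; a minimal example assuming the legacy
// PassManager interface available in this tree, and that the declaration of
// createLoopVectorizePass() is visible to the caller). 'runVectorizer' is a
// hypothetical helper, not part of this file:
//
//   #include "llvm/PassManager.h"
//
//   void runVectorizer(Module &M) {
//     PassManager PM;
//     PM.add(createLoopVectorizePass());
//     PM.run(M);
//   }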