1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11 // and generates target-independent LLVM-IR. Legalization of the IR is done 12 // in the codegen. However, the vectorizes uses (will use) the codegen 13 // interfaces to generate IR that is likely to result in an optimal binary. 14 // 15 // The loop vectorizer combines consecutive loop iteration into a single 16 // 'wide' iteration. After this transformation the index is incremented 17 // by the SIMD vector width, and not by one. 18 // 19 // This pass has three parts: 20 // 1. The main loop pass that drives the different parts. 21 // 2. LoopVectorizationLegality - A unit that checks for the legality 22 // of the vectorization. 23 // 3. InnerLoopVectorizer - A unit that performs the actual 24 // widening of instructions. 25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 26 // of vectorization. It decides on the optimal vector width, which 27 // can be one, if vectorization is not profitable. 28 // 29 //===----------------------------------------------------------------------===// 30 // 31 // The reduction-variable vectorization is based on the paper: 32 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33 // 34 // Variable uniformity checks are inspired by: 35 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 36 // 37 // Other ideas/concepts are from: 38 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 39 // 40 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 41 // Vectorizing Compilers. 42 // 43 //===----------------------------------------------------------------------===// 44 #define LV_NAME "loop-vectorize" 45 #define DEBUG_TYPE LV_NAME 46 #include "llvm/Transforms/Vectorize.h" 47 #include "llvm/ADT/SmallVector.h" 48 #include "llvm/ADT/StringExtras.h" 49 #include "llvm/Analysis/AliasAnalysis.h" 50 #include "llvm/Analysis/AliasSetTracker.h" 51 #include "llvm/Analysis/Dominators.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/LoopIterator.h" 54 #include "llvm/Analysis/LoopPass.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpander.h" 57 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 58 #include "llvm/Analysis/ValueTracking.h" 59 #include "llvm/Analysis/Verifier.h" 60 #include "llvm/Constants.h" 61 #include "llvm/DataLayout.h" 62 #include "llvm/DerivedTypes.h" 63 #include "llvm/Function.h" 64 #include "llvm/Instructions.h" 65 #include "llvm/LLVMContext.h" 66 #include "llvm/Module.h" 67 #include "llvm/Pass.h" 68 #include "llvm/Support/CommandLine.h" 69 #include "llvm/Support/Debug.h" 70 #include "llvm/Support/raw_ostream.h" 71 #include "llvm/TargetTransformInfo.h" 72 #include "llvm/Transforms/Scalar.h" 73 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 74 #include "llvm/Transforms/Utils/Local.h" 75 #include "llvm/Type.h" 76 #include "llvm/Value.h" 77 #include <algorithm> 78 using namespace llvm; 79 80 static cl::opt<unsigned> 81 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 82 cl::desc("Set the default vectorization width. Zero is autoselect.")); 83 84 static cl::opt<bool> 85 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 86 cl::desc("Enable if-conversion during vectorization.")); 87 88 /// We don't vectorize loops with a known constant trip count below this number. 89 const unsigned TinyTripCountThreshold = 16; 90 91 /// When performing a runtime memory check, do not check more than this 92 /// number of pointers. Notice that the check is quadratic! 93 const unsigned RuntimeMemoryCheckThreshold = 4; 94 95 /// This is the highest vector width that we try to generate. 96 const unsigned MaxVectorSize = 8; 97 98 namespace { 99 100 // Forward declarations. 101 class LoopVectorizationLegality; 102 class LoopVectorizationCostModel; 103 104 /// InnerLoopVectorizer vectorizes loops which contain only one basic 105 /// block to a specified vectorization factor (VF). 106 /// This class performs the widening of scalars into vectors, or multiple 107 /// scalars. This class also implements the following features: 108 /// * It inserts an epilogue loop for handling loops that don't have iteration 109 /// counts that are known to be a multiple of the vectorization factor. 110 /// * It handles the code generation for reduction variables. 111 /// * Scalarization (implementation using scalars) of un-vectorizable 112 /// instructions. 113 /// InnerLoopVectorizer does not perform any vectorization-legality 114 /// checks, and relies on the caller to check for the different legality 115 /// aspects. The InnerLoopVectorizer relies on the 116 /// LoopVectorizationLegality class to provide information about the induction 117 /// and reduction variables that were found to a given vectorization factor. 118 class InnerLoopVectorizer { 119 public: 120 /// Ctor. 121 InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 122 DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth): 123 OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), 124 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 125 126 // Perform the actual loop widening (vectorization). 127 void vectorize(LoopVectorizationLegality *Legal) { 128 // Create a new empty loop. Unlink the old loop and connect the new one. 129 createEmptyLoop(Legal); 130 // Widen each instruction in the old loop to a new one in the new loop. 131 // Use the Legality module to find the induction and reduction variables. 132 vectorizeLoop(Legal); 133 // Register the new loop and update the analysis passes. 134 updateAnalysis(); 135 } 136 137 private: 138 /// A small list of PHINodes. 139 typedef SmallVector<PHINode*, 4> PhiVector; 140 141 /// Add code that checks at runtime if the accessed arrays overlap. 142 /// Returns the comperator value or NULL if no check is needed. 143 Value *addRuntimeCheck(LoopVectorizationLegality *Legal, 144 Instruction *Loc); 145 /// Create an empty loop, based on the loop ranges of the old loop. 146 void createEmptyLoop(LoopVectorizationLegality *Legal); 147 /// Copy and widen the instructions from the old loop. 148 void vectorizeLoop(LoopVectorizationLegality *Legal); 149 150 /// A helper function that computes the predicate of the block BB, assuming 151 /// that the header block of the loop is set to True. It returns the *entry* 152 /// mask for the block BB. 153 Value *createBlockInMask(BasicBlock *BB); 154 /// A helper function that computes the predicate of the edge between SRC 155 /// and DST. 156 Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 157 158 /// A helper function to vectorize a single BB within the innermost loop. 159 void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, 160 PhiVector *PV); 161 162 /// Insert the new loop to the loop hierarchy and pass manager 163 /// and update the analysis passes. 164 void updateAnalysis(); 165 166 /// This instruction is un-vectorizable. Implement it as a sequence 167 /// of scalars. 168 void scalarizeInstruction(Instruction *Instr); 169 170 /// Create a broadcast instruction. This method generates a broadcast 171 /// instruction (shuffle) for loop invariant values and for the induction 172 /// value. If this is the induction variable then we extend it to N, N+1, ... 173 /// this is needed because each iteration in the loop corresponds to a SIMD 174 /// element. 175 Value *getBroadcastInstrs(Value *V); 176 177 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 178 /// for each element in the vector. Starting from zero. 179 Value *getConsecutiveVector(Value* Val); 180 181 /// When we go over instructions in the basic block we rely on previous 182 /// values within the current basic block or on loop invariant values. 183 /// When we widen (vectorize) values we place them in the map. If the values 184 /// are not within the map, they have to be loop invariant, so we simply 185 /// broadcast them into a vector. 186 Value *getVectorValue(Value *V); 187 188 /// Get a uniform vector of constant integers. We use this to get 189 /// vectors of ones and zeros for the reduction code. 190 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 191 192 typedef DenseMap<Value*, Value*> ValueMap; 193 194 /// The original loop. 195 Loop *OrigLoop; 196 // Scev analysis to use. 197 ScalarEvolution *SE; 198 // Loop Info. 199 LoopInfo *LI; 200 // Dominator Tree. 201 DominatorTree *DT; 202 // Data Layout. 203 DataLayout *DL; 204 // The vectorization factor to use. 205 unsigned VF; 206 207 // The builder that we use 208 IRBuilder<> Builder; 209 210 // --- Vectorization state --- 211 212 /// The vector-loop preheader. 213 BasicBlock *LoopVectorPreHeader; 214 /// The scalar-loop preheader. 215 BasicBlock *LoopScalarPreHeader; 216 /// Middle Block between the vector and the scalar. 217 BasicBlock *LoopMiddleBlock; 218 ///The ExitBlock of the scalar loop. 219 BasicBlock *LoopExitBlock; 220 ///The vector loop body. 221 BasicBlock *LoopVectorBody; 222 ///The scalar loop body. 223 BasicBlock *LoopScalarBody; 224 ///The first bypass block. 225 BasicBlock *LoopBypassBlock; 226 227 /// The new Induction variable which was added to the new block. 228 PHINode *Induction; 229 /// The induction variable of the old basic block. 230 PHINode *OldInduction; 231 // Maps scalars to widened vectors. 232 ValueMap WidenMap; 233 }; 234 235 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 236 /// to what vectorization factor. 237 /// This class does not look at the profitability of vectorization, only the 238 /// legality. This class has two main kinds of checks: 239 /// * Memory checks - The code in canVectorizeMemory checks if vectorization 240 /// will change the order of memory accesses in a way that will change the 241 /// correctness of the program. 242 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory 243 /// checks for a number of different conditions, such as the availability of a 244 /// single induction variable, that all types are supported and vectorize-able, 245 /// etc. This code reflects the capabilities of InnerLoopVectorizer. 246 /// This class is also used by InnerLoopVectorizer for identifying 247 /// induction variable and the different reduction variables. 248 class LoopVectorizationLegality { 249 public: 250 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl, 251 DominatorTree *Dt): 252 TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { } 253 254 /// This represents the kinds of reductions that we support. 255 enum ReductionKind { 256 NoReduction, /// Not a reduction. 257 IntegerAdd, /// Sum of numbers. 258 IntegerMult, /// Product of numbers. 259 IntegerOr, /// Bitwise or logical OR of numbers. 260 IntegerAnd, /// Bitwise or logical AND of numbers. 261 IntegerXor /// Bitwise or logical XOR of numbers. 262 }; 263 264 /// This POD struct holds information about reduction variables. 265 struct ReductionDescriptor { 266 // Default C'tor 267 ReductionDescriptor(): 268 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 269 270 // C'tor. 271 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 272 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 273 274 // The starting value of the reduction. 275 // It does not have to be zero! 276 Value *StartValue; 277 // The instruction who's value is used outside the loop. 278 Instruction *LoopExitInstr; 279 // The kind of the reduction. 280 ReductionKind Kind; 281 }; 282 283 // This POD struct holds information about the memory runtime legality 284 // check that a group of pointers do not overlap. 285 struct RuntimePointerCheck { 286 RuntimePointerCheck(): Need(false) {} 287 288 /// Reset the state of the pointer runtime information. 289 void reset() { 290 Need = false; 291 Pointers.clear(); 292 Starts.clear(); 293 Ends.clear(); 294 } 295 296 /// Insert a pointer and calculate the start and end SCEVs. 297 void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { 298 const SCEV *Sc = SE->getSCEV(Ptr); 299 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 300 assert(AR && "Invalid addrec expression"); 301 const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); 302 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 303 Pointers.push_back(Ptr); 304 Starts.push_back(AR->getStart()); 305 Ends.push_back(ScEnd); 306 } 307 308 /// This flag indicates if we need to add the runtime check. 309 bool Need; 310 /// Holds the pointers that we need to check. 311 SmallVector<Value*, 2> Pointers; 312 /// Holds the pointer value at the beginning of the loop. 313 SmallVector<const SCEV*, 2> Starts; 314 /// Holds the pointer value at the end of the loop. 315 SmallVector<const SCEV*, 2> Ends; 316 }; 317 318 /// ReductionList contains the reduction descriptors for all 319 /// of the reductions that were found in the loop. 320 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 321 322 /// InductionList saves induction variables and maps them to the initial 323 /// value entring the loop. 324 typedef DenseMap<PHINode*, Value*> InductionList; 325 326 /// Returns true if it is legal to vectorize this loop. 327 /// This does not mean that it is profitable to vectorize this 328 /// loop, only that it is legal to do so. 329 bool canVectorize(); 330 331 /// Returns the Induction variable. 332 PHINode *getInduction() {return Induction;} 333 334 /// Returns the reduction variables found in the loop. 335 ReductionList *getReductionVars() { return &Reductions; } 336 337 /// Returns the induction variables found in the loop. 338 InductionList *getInductionVars() { return &Inductions; } 339 340 /// Return true if the block BB needs to be predicated in order for the loop 341 /// to be vectorized. 342 bool blockNeedsPredication(BasicBlock *BB); 343 344 /// Check if this pointer is consecutive when vectorizing. This happens 345 /// when the last index of the GEP is the induction variable, or that the 346 /// pointer itself is an induction variable. 347 /// This check allows us to vectorize A[idx] into a wide load/store. 348 bool isConsecutivePtr(Value *Ptr); 349 350 /// Returns true if the value V is uniform within the loop. 351 bool isUniform(Value *V); 352 353 /// Returns true if this instruction will remain scalar after vectorization. 354 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 355 356 /// Returns the information that we collected about runtime memory check. 357 RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } 358 private: 359 /// Check if a single basic block loop is vectorizable. 360 /// At this point we know that this is a loop with a constant trip count 361 /// and we only need to check individual instructions. 362 bool canVectorizeInstrs(); 363 364 /// When we vectorize loops we may change the order in which 365 /// we read and write from memory. This method checks if it is 366 /// legal to vectorize the code, considering only memory constrains. 367 /// Returns true if the loop is vectorizable 368 bool canVectorizeMemory(); 369 370 /// Return true if we can vectorize this loop using the IF-conversion 371 /// transformation. 372 bool canVectorizeWithIfConvert(); 373 374 /// Collect the variables that need to stay uniform after vectorization. 375 void collectLoopUniforms(); 376 377 /// Return true if all of the instructions in the block can be speculatively 378 /// executed. 379 bool blockCanBePredicated(BasicBlock *BB); 380 381 /// Returns True, if 'Phi' is the kind of reduction variable for type 382 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 383 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 384 /// Returns true if the instruction I can be a reduction variable of type 385 /// 'Kind'. 386 bool isReductionInstr(Instruction *I, ReductionKind Kind); 387 /// Returns True, if 'Phi' is an induction variable. 388 bool isInductionVariable(PHINode *Phi); 389 /// Return true if can compute the address bounds of Ptr within the loop. 390 bool hasComputableBounds(Value *Ptr); 391 392 /// The loop that we evaluate. 393 Loop *TheLoop; 394 /// Scev analysis. 395 ScalarEvolution *SE; 396 /// DataLayout analysis. 397 DataLayout *DL; 398 // Dominators. 399 DominatorTree *DT; 400 401 // --- vectorization state --- // 402 403 /// Holds the integer induction variable. This is the counter of the 404 /// loop. 405 PHINode *Induction; 406 /// Holds the reduction variables. 407 ReductionList Reductions; 408 /// Holds all of the induction variables that we found in the loop. 409 /// Notice that inductions don't need to start at zero and that induction 410 /// variables can be pointers. 411 InductionList Inductions; 412 413 /// Allowed outside users. This holds the reduction 414 /// vars which can be accessed from outside the loop. 415 SmallPtrSet<Value*, 4> AllowedExit; 416 /// This set holds the variables which are known to be uniform after 417 /// vectorization. 418 SmallPtrSet<Instruction*, 4> Uniforms; 419 /// We need to check that all of the pointers in this list are disjoint 420 /// at runtime. 421 RuntimePointerCheck PtrRtCheck; 422 }; 423 424 /// LoopVectorizationCostModel - estimates the expected speedups due to 425 /// vectorization. 426 /// In many cases vectorization is not profitable. This can happen because 427 /// of a number of reasons. In this class we mainly attempt to predict 428 /// the expected speedup/slowdowns due to the supported instruction set. 429 /// We use the VectorTargetTransformInfo to query the different backends 430 /// for the cost of different operations. 431 class LoopVectorizationCostModel { 432 public: 433 /// C'tor. 434 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, 435 LoopVectorizationLegality *Leg, 436 const VectorTargetTransformInfo *Vtti): 437 TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } 438 439 /// Returns the most profitable vectorization factor for the loop that is 440 /// smaller or equal to the VF argument. This method checks every power 441 /// of two up to VF. 442 unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize); 443 444 private: 445 /// Returns the expected execution cost. The unit of the cost does 446 /// not matter because we use the 'cost' units to compare different 447 /// vector widths. The cost that is returned is *not* normalized by 448 /// the factor width. 449 unsigned expectedCost(unsigned VF); 450 451 /// Returns the execution time cost of an instruction for a given vector 452 /// width. Vector width of one means scalar. 453 unsigned getInstructionCost(Instruction *I, unsigned VF); 454 455 /// A helper function for converting Scalar types to vector types. 456 /// If the incoming type is void, we return void. If the VF is 1, we return 457 /// the scalar type. 458 static Type* ToVectorTy(Type *Scalar, unsigned VF); 459 460 /// The loop that we evaluate. 461 Loop *TheLoop; 462 /// Scev analysis. 463 ScalarEvolution *SE; 464 465 /// Vectorization legality. 466 LoopVectorizationLegality *Legal; 467 /// Vector target information. 468 const VectorTargetTransformInfo *VTTI; 469 }; 470 471 struct LoopVectorize : public LoopPass { 472 static char ID; // Pass identification, replacement for typeid 473 474 LoopVectorize() : LoopPass(ID) { 475 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 476 } 477 478 ScalarEvolution *SE; 479 DataLayout *DL; 480 LoopInfo *LI; 481 TargetTransformInfo *TTI; 482 DominatorTree *DT; 483 484 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 485 // We only vectorize innermost loops. 486 if (!L->empty()) 487 return false; 488 489 SE = &getAnalysis<ScalarEvolution>(); 490 DL = getAnalysisIfAvailable<DataLayout>(); 491 LI = &getAnalysis<LoopInfo>(); 492 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 493 DT = &getAnalysis<DominatorTree>(); 494 495 DEBUG(dbgs() << "LV: Checking a loop in \"" << 496 L->getHeader()->getParent()->getName() << "\"\n"); 497 498 // Check if it is legal to vectorize the loop. 499 LoopVectorizationLegality LVL(L, SE, DL, DT); 500 if (!LVL.canVectorize()) { 501 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 502 return false; 503 } 504 505 // Select the preffered vectorization factor. 506 unsigned VF = 1; 507 if (VectorizationFactor == 0) { 508 const VectorTargetTransformInfo *VTTI = 0; 509 if (TTI) 510 VTTI = TTI->getVectorTargetTransformInfo(); 511 // Use the cost model. 512 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 513 VF = CM.findBestVectorizationFactor(); 514 515 if (VF == 1) { 516 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 517 return false; 518 } 519 520 } else { 521 // Use the user command flag. 522 VF = VectorizationFactor; 523 } 524 525 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 526 L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< 527 "\n"); 528 529 // If we decided that it is *legal* to vectorizer the loop then do it. 530 InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); 531 LB.vectorize(&LVL); 532 533 DEBUG(verifyFunction(*L->getHeader()->getParent())); 534 return true; 535 } 536 537 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 538 LoopPass::getAnalysisUsage(AU); 539 AU.addRequiredID(LoopSimplifyID); 540 AU.addRequiredID(LCSSAID); 541 AU.addRequired<LoopInfo>(); 542 AU.addRequired<ScalarEvolution>(); 543 AU.addRequired<DominatorTree>(); 544 AU.addPreserved<LoopInfo>(); 545 AU.addPreserved<DominatorTree>(); 546 } 547 548 }; 549 550 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 551 // Create the types. 552 LLVMContext &C = V->getContext(); 553 Type *VTy = VectorType::get(V->getType(), VF); 554 Type *I32 = IntegerType::getInt32Ty(C); 555 556 // Save the current insertion location. 557 Instruction *Loc = Builder.GetInsertPoint(); 558 559 // We need to place the broadcast of invariant variables outside the loop. 560 bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction); 561 562 // Place the code for broadcasting invariant variables in the new preheader. 563 if (Invariant) 564 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 565 566 Constant *Zero = ConstantInt::get(I32, 0); 567 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 568 Value *UndefVal = UndefValue::get(VTy); 569 // Insert the value into a new vector. 570 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 571 // Broadcast the scalar into all locations in the vector. 572 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 573 "broadcast"); 574 575 // Restore the builder insertion point. 576 if (Invariant) 577 Builder.SetInsertPoint(Loc); 578 579 return Shuf; 580 } 581 582 Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) { 583 assert(Val->getType()->isVectorTy() && "Must be a vector"); 584 assert(Val->getType()->getScalarType()->isIntegerTy() && 585 "Elem must be an integer"); 586 // Create the types. 587 Type *ITy = Val->getType()->getScalarType(); 588 VectorType *Ty = cast<VectorType>(Val->getType()); 589 unsigned VLen = Ty->getNumElements(); 590 SmallVector<Constant*, 8> Indices; 591 592 // Create a vector of consecutive numbers from zero to VF. 593 for (unsigned i = 0; i < VLen; ++i) 594 Indices.push_back(ConstantInt::get(ITy, i)); 595 596 // Add the consecutive indices to the vector value. 597 Constant *Cv = ConstantVector::get(Indices); 598 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 599 return Builder.CreateAdd(Val, Cv, "induction"); 600 } 601 602 bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 603 assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 604 605 // If this pointer is an induction variable, return it. 606 PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 607 if (Phi && getInductionVars()->count(Phi)) 608 return true; 609 610 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 611 if (!Gep) 612 return false; 613 614 unsigned NumOperands = Gep->getNumOperands(); 615 Value *LastIndex = Gep->getOperand(NumOperands - 1); 616 617 // Check that all of the gep indices are uniform except for the last. 618 for (unsigned i = 0; i < NumOperands - 1; ++i) 619 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 620 return false; 621 622 // We can emit wide load/stores only if the last index is the induction 623 // variable. 624 const SCEV *Last = SE->getSCEV(LastIndex); 625 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 626 const SCEV *Step = AR->getStepRecurrence(*SE); 627 628 // The memory is consecutive because the last index is consecutive 629 // and all other indices are loop invariant. 630 if (Step->isOne()) 631 return true; 632 } 633 634 return false; 635 } 636 637 bool LoopVectorizationLegality::isUniform(Value *V) { 638 return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 639 } 640 641 Value *InnerLoopVectorizer::getVectorValue(Value *V) { 642 assert(V != Induction && "The new induction variable should not be used."); 643 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 644 // If we saved a vectorized copy of V, use it. 645 Value *&MapEntry = WidenMap[V]; 646 if (MapEntry) 647 return MapEntry; 648 649 // Broadcast V and save the value for future uses. 650 Value *B = getBroadcastInstrs(V); 651 MapEntry = B; 652 return B; 653 } 654 655 Constant* 656 InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 657 return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); 658 } 659 660 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 661 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 662 // Holds vector parameters or scalars, in case of uniform vals. 663 SmallVector<Value*, 8> Params; 664 665 // Find all of the vectorized parameters. 666 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 667 Value *SrcOp = Instr->getOperand(op); 668 669 // If we are accessing the old induction variable, use the new one. 670 if (SrcOp == OldInduction) { 671 Params.push_back(getVectorValue(SrcOp)); 672 continue; 673 } 674 675 // Try using previously calculated values. 676 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 677 678 // If the src is an instruction that appeared earlier in the basic block 679 // then it should already be vectorized. 680 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 681 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 682 // The parameter is a vector value from earlier. 683 Params.push_back(WidenMap[SrcInst]); 684 } else { 685 // The parameter is a scalar from outside the loop. Maybe even a constant. 686 Params.push_back(SrcOp); 687 } 688 } 689 690 assert(Params.size() == Instr->getNumOperands() && 691 "Invalid number of operands"); 692 693 // Does this instruction return a value ? 694 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 695 Value *VecResults = 0; 696 697 // If we have a return value, create an empty vector. We place the scalarized 698 // instructions in this vector. 699 if (!IsVoidRetTy) 700 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 701 702 // For each scalar that we create: 703 for (unsigned i = 0; i < VF; ++i) { 704 Instruction *Cloned = Instr->clone(); 705 if (!IsVoidRetTy) 706 Cloned->setName(Instr->getName() + ".cloned"); 707 // Replace the operands of the cloned instrucions with extracted scalars. 708 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 709 Value *Op = Params[op]; 710 // Param is a vector. Need to extract the right lane. 711 if (Op->getType()->isVectorTy()) 712 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 713 Cloned->setOperand(op, Op); 714 } 715 716 // Place the cloned scalar in the new loop. 717 Builder.Insert(Cloned); 718 719 // If the original scalar returns a value we need to place it in a vector 720 // so that future users will be able to use it. 721 if (!IsVoidRetTy) 722 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 723 Builder.getInt32(i)); 724 } 725 726 if (!IsVoidRetTy) 727 WidenMap[Instr] = VecResults; 728 } 729 730 Value* 731 InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 732 Instruction *Loc) { 733 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 734 Legal->getRuntimePointerCheck(); 735 736 if (!PtrRtCheck->Need) 737 return NULL; 738 739 Value *MemoryRuntimeCheck = 0; 740 unsigned NumPointers = PtrRtCheck->Pointers.size(); 741 SmallVector<Value* , 2> Starts; 742 SmallVector<Value* , 2> Ends; 743 744 SCEVExpander Exp(*SE, "induction"); 745 746 // Use this type for pointer arithmetic. 747 Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); 748 749 for (unsigned i = 0; i < NumPointers; ++i) { 750 Value *Ptr = PtrRtCheck->Pointers[i]; 751 const SCEV *Sc = SE->getSCEV(Ptr); 752 753 if (SE->isLoopInvariant(Sc, OrigLoop)) { 754 DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 755 *Ptr <<"\n"); 756 Starts.push_back(Ptr); 757 Ends.push_back(Ptr); 758 } else { 759 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 760 761 Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], 762 PtrArithTy, Loc); 763 Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 764 Starts.push_back(Start); 765 Ends.push_back(End); 766 } 767 } 768 769 for (unsigned i = 0; i < NumPointers; ++i) { 770 for (unsigned j = i+1; j < NumPointers; ++j) { 771 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 772 Starts[i], Ends[j], "bound0", Loc); 773 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 774 Starts[j], Ends[i], "bound1", Loc); 775 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 776 "found.conflict", Loc); 777 if (MemoryRuntimeCheck) 778 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 779 MemoryRuntimeCheck, 780 IsConflict, 781 "conflict.rdx", Loc); 782 else 783 MemoryRuntimeCheck = IsConflict; 784 785 } 786 } 787 788 return MemoryRuntimeCheck; 789 } 790 791 void 792 InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 793 /* 794 In this function we generate a new loop. The new loop will contain 795 the vectorized instructions while the old loop will continue to run the 796 scalar remainder. 797 798 [ ] <-- vector loop bypass. 799 / | 800 / v 801 | [ ] <-- vector pre header. 802 | | 803 | v 804 | [ ] \ 805 | [ ]_| <-- vector loop. 806 | | 807 \ v 808 >[ ] <--- middle-block. 809 / | 810 / v 811 | [ ] <--- new preheader. 812 | | 813 | v 814 | [ ] \ 815 | [ ]_| <-- old scalar loop to handle remainder. 816 \ | 817 \ v 818 >[ ] <-- exit block. 819 ... 820 */ 821 822 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 823 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 824 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 825 assert(ExitBlock && "Must have an exit block"); 826 827 // Some loops have a single integer induction variable, while other loops 828 // don't. One example is c++ iterators that often have multiple pointer 829 // induction variables. In the code below we also support a case where we 830 // don't have a single induction variable. 831 OldInduction = Legal->getInduction(); 832 Type *IdxTy = OldInduction ? OldInduction->getType() : 833 DL->getIntPtrType(SE->getContext()); 834 835 // Find the loop boundaries. 836 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); 837 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 838 839 // Get the total trip count from the count by adding 1. 840 ExitCount = SE->getAddExpr(ExitCount, 841 SE->getConstant(ExitCount->getType(), 1)); 842 843 // Expand the trip count and place the new instructions in the preheader. 844 // Notice that the pre-header does not change, only the loop body. 845 SCEVExpander Exp(*SE, "induction"); 846 847 // Count holds the overall loop count (N). 848 Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 849 BypassBlock->getTerminator()); 850 851 // The loop index does not have to start at Zero. Find the original start 852 // value from the induction PHI node. If we don't have an induction variable 853 // then we know that it starts at zero. 854 Value *StartIdx = OldInduction ? 855 OldInduction->getIncomingValueForBlock(BypassBlock): 856 ConstantInt::get(IdxTy, 0); 857 858 assert(BypassBlock && "Invalid loop structure"); 859 860 // Generate the code that checks in runtime if arrays overlap. 861 Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, 862 BypassBlock->getTerminator()); 863 864 // Split the single block loop into the two loop structure described above. 865 BasicBlock *VectorPH = 866 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 867 BasicBlock *VecBody = 868 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 869 BasicBlock *MiddleBlock = 870 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 871 BasicBlock *ScalarPH = 872 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 873 874 // This is the location in which we add all of the logic for bypassing 875 // the new vector loop. 876 Instruction *Loc = BypassBlock->getTerminator(); 877 878 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 879 // inside the loop. 880 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 881 882 // Generate the induction variable. 883 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 884 Constant *Step = ConstantInt::get(IdxTy, VF); 885 886 // We may need to extend the index in case there is a type mismatch. 887 // We know that the count starts at zero and does not overflow. 888 if (Count->getType() != IdxTy) { 889 // The exit count can be of pointer type. Convert it to the correct 890 // integer type. 891 if (ExitCount->getType()->isPointerTy()) 892 Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); 893 else 894 Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); 895 } 896 897 // Add the start index to the loop count to get the new end index. 898 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 899 900 // Now we need to generate the expression for N - (N % VF), which is 901 // the part that the vectorized body will execute. 902 Constant *CIVF = ConstantInt::get(IdxTy, VF); 903 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 904 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 905 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 906 "end.idx.rnd.down", Loc); 907 908 // Now, compare the new count to zero. If it is zero skip the vector loop and 909 // jump to the scalar loop. 910 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 911 IdxEndRoundDown, 912 StartIdx, 913 "cmp.zero", Loc); 914 915 // If we are using memory runtime checks, include them in. 916 if (MemoryRuntimeCheck) 917 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 918 "CntOrMem", Loc); 919 920 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 921 // Remove the old terminator. 922 Loc->eraseFromParent(); 923 924 // We are going to resume the execution of the scalar loop. 925 // Go over all of the induction variables that we found and fix the 926 // PHIs that are left in the scalar version of the loop. 927 // The starting values of PHI nodes depend on the counter of the last 928 // iteration in the vectorized loop. 929 // If we come from a bypass edge then we need to start from the original start 930 // value. 931 932 // This variable saves the new starting index for the scalar loop. 933 PHINode *ResumeIndex = 0; 934 LoopVectorizationLegality::InductionList::iterator I, E; 935 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 936 for (I = List->begin(), E = List->end(); I != E; ++I) { 937 PHINode *OrigPhi = I->first; 938 PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", 939 MiddleBlock->getTerminator()); 940 Value *EndValue = 0; 941 if (OrigPhi->getType()->isIntegerTy()) { 942 // Handle the integer induction counter: 943 assert(OrigPhi == OldInduction && "Unknown integer PHI"); 944 // We know what the end value is. 945 EndValue = IdxEndRoundDown; 946 // We also know which PHI node holds it. 947 ResumeIndex = ResumeVal; 948 } else { 949 // For pointer induction variables, calculate the offset using 950 // the end index. 951 EndValue = GetElementPtrInst::Create(I->second, CountRoundDown, 952 "ptr.ind.end", 953 BypassBlock->getTerminator()); 954 } 955 956 // The new PHI merges the original incoming value, in case of a bypass, 957 // or the value at the end of the vectorized loop. 958 ResumeVal->addIncoming(I->second, BypassBlock); 959 ResumeVal->addIncoming(EndValue, VecBody); 960 961 // Fix the scalar body counter (PHI node). 962 unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); 963 OrigPhi->setIncomingValue(BlockIdx, ResumeVal); 964 } 965 966 // If we are generating a new induction variable then we also need to 967 // generate the code that calculates the exit value. This value is not 968 // simply the end of the counter because we may skip the vectorized body 969 // in case of a runtime check. 970 if (!OldInduction){ 971 assert(!ResumeIndex && "Unexpected resume value found"); 972 ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", 973 MiddleBlock->getTerminator()); 974 ResumeIndex->addIncoming(StartIdx, BypassBlock); 975 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 976 } 977 978 // Make sure that we found the index where scalar loop needs to continue. 979 assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && 980 "Invalid resume Index"); 981 982 // Add a check in the middle block to see if we have completed 983 // all of the iterations in the first vector loop. 984 // If (N - N%VF) == N, then we *don't* need to run the remainder. 985 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 986 ResumeIndex, "cmp.n", 987 MiddleBlock->getTerminator()); 988 989 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 990 // Remove the old terminator. 991 MiddleBlock->getTerminator()->eraseFromParent(); 992 993 // Create i+1 and fill the PHINode. 994 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 995 Induction->addIncoming(StartIdx, VectorPH); 996 Induction->addIncoming(NextIdx, VecBody); 997 // Create the compare. 998 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 999 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 1000 1001 // Now we have two terminators. Remove the old one from the block. 1002 VecBody->getTerminator()->eraseFromParent(); 1003 1004 // Get ready to start creating new instructions into the vectorized body. 1005 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 1006 1007 // Create and register the new vector loop. 1008 Loop* Lp = new Loop(); 1009 Loop *ParentLoop = OrigLoop->getParentLoop(); 1010 1011 // Insert the new loop into the loop nest and register the new basic blocks. 1012 if (ParentLoop) { 1013 ParentLoop->addChildLoop(Lp); 1014 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 1015 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 1016 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 1017 } else { 1018 LI->addTopLevelLoop(Lp); 1019 } 1020 1021 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 1022 1023 // Save the state. 1024 LoopVectorPreHeader = VectorPH; 1025 LoopScalarPreHeader = ScalarPH; 1026 LoopMiddleBlock = MiddleBlock; 1027 LoopExitBlock = ExitBlock; 1028 LoopVectorBody = VecBody; 1029 LoopScalarBody = OldBasicBlock; 1030 LoopBypassBlock = BypassBlock; 1031 } 1032 1033 /// This function returns the identity element (or neutral element) for 1034 /// the operation K. 1035 static unsigned 1036 getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 1037 switch (K) { 1038 case LoopVectorizationLegality::IntegerXor: 1039 case LoopVectorizationLegality::IntegerAdd: 1040 case LoopVectorizationLegality::IntegerOr: 1041 // Adding, Xoring, Oring zero to a number does not change it. 1042 return 0; 1043 case LoopVectorizationLegality::IntegerMult: 1044 // Multiplying a number by 1 does not change it. 1045 return 1; 1046 case LoopVectorizationLegality::IntegerAnd: 1047 // AND-ing a number with an all-1 value does not change it. 1048 return -1; 1049 default: 1050 llvm_unreachable("Unknown reduction kind"); 1051 } 1052 } 1053 1054 void 1055 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 1056 //===------------------------------------------------===// 1057 // 1058 // Notice: any optimization or new instruction that go 1059 // into the code below should be also be implemented in 1060 // the cost-model. 1061 // 1062 //===------------------------------------------------===// 1063 BasicBlock &BB = *OrigLoop->getHeader(); 1064 Constant *Zero = ConstantInt::get( 1065 IntegerType::getInt32Ty(BB.getContext()), 0); 1066 1067 // In order to support reduction variables we need to be able to vectorize 1068 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 1069 // stages. First, we create a new vector PHI node with no incoming edges. 1070 // We use this value when we vectorize all of the instructions that use the 1071 // PHI. Next, after all of the instructions in the block are complete we 1072 // add the new incoming edges to the PHI. At this point all of the 1073 // instructions in the basic block are vectorized, so we can use them to 1074 // construct the PHI. 1075 PhiVector RdxPHIsToFix; 1076 1077 // Scan the loop in a topological order to ensure that defs are vectorized 1078 // before users. 1079 LoopBlocksDFS DFS(OrigLoop); 1080 DFS.perform(LI); 1081 1082 // Vectorize all of the blocks in the original loop. 1083 for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 1084 be = DFS.endRPO(); bb != be; ++bb) 1085 vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); 1086 1087 // At this point every instruction in the original loop is widened to 1088 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1089 // that we vectorized. The PHI nodes are currently empty because we did 1090 // not want to introduce cycles. Notice that the remaining PHI nodes 1091 // that we need to fix are reduction variables. 1092 1093 // Create the 'reduced' values for each of the induction vars. 1094 // The reduced values are the vector values that we scalarize and combine 1095 // after the loop is finished. 1096 for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 1097 it != e; ++it) { 1098 PHINode *RdxPhi = *it; 1099 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 1100 assert(RdxPhi && "Unable to recover vectorized PHI"); 1101 1102 // Find the reduction variable descriptor. 1103 assert(Legal->getReductionVars()->count(RdxPhi) && 1104 "Unable to find the reduction variable"); 1105 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1106 (*Legal->getReductionVars())[RdxPhi]; 1107 1108 // We need to generate a reduction vector from the incoming scalar. 1109 // To do so, we need to generate the 'identity' vector and overide 1110 // one of the elements with the incoming scalar reduction. We need 1111 // to do it in the vector-loop preheader. 1112 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 1113 1114 // This is the vector-clone of the value that leaves the loop. 1115 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1116 Type *VecTy = VectorExit->getType(); 1117 1118 // Find the reduction identity variable. Zero for addition, or, xor, 1119 // one for multiplication, -1 for And. 1120 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 1121 VecTy->getScalarType()); 1122 1123 // This vector is the Identity vector where the first element is the 1124 // incoming scalar reduction. 1125 Value *VectorStart = Builder.CreateInsertElement(Identity, 1126 RdxDesc.StartValue, Zero); 1127 1128 // Fix the vector-loop phi. 1129 // We created the induction variable so we know that the 1130 // preheader is the first entry. 1131 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1132 1133 // Reductions do not have to start at zero. They can start with 1134 // any loop invariant values. 1135 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 1136 Value *Val = 1137 getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); 1138 VecRdxPhi->addIncoming(Val, LoopVectorBody); 1139 1140 // Before each round, move the insertion point right between 1141 // the PHIs and the values we are going to write. 1142 // This allows us to write both PHINodes and the extractelement 1143 // instructions. 1144 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1145 1146 // This PHINode contains the vectorized reduction variable, or 1147 // the initial value vector, if we bypass the vector loop. 1148 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1149 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 1150 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 1151 1152 // Extract the first scalar. 1153 Value *Scalar0 = 1154 Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); 1155 // Extract and reduce the remaining vector elements. 1156 for (unsigned i=1; i < VF; ++i) { 1157 Value *Scalar1 = 1158 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); 1159 switch (RdxDesc.Kind) { 1160 case LoopVectorizationLegality::IntegerAdd: 1161 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); 1162 break; 1163 case LoopVectorizationLegality::IntegerMult: 1164 Scalar0 = Builder.CreateMul(Scalar0, Scalar1); 1165 break; 1166 case LoopVectorizationLegality::IntegerOr: 1167 Scalar0 = Builder.CreateOr(Scalar0, Scalar1); 1168 break; 1169 case LoopVectorizationLegality::IntegerAnd: 1170 Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); 1171 break; 1172 case LoopVectorizationLegality::IntegerXor: 1173 Scalar0 = Builder.CreateXor(Scalar0, Scalar1); 1174 break; 1175 default: 1176 llvm_unreachable("Unknown reduction operation"); 1177 } 1178 } 1179 1180 // Now, we need to fix the users of the reduction variable 1181 // inside and outside of the scalar remainder loop. 1182 // We know that the loop is in LCSSA form. We need to update the 1183 // PHI nodes in the exit blocks. 1184 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1185 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1186 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1187 if (!LCSSAPhi) continue; 1188 1189 // All PHINodes need to have a single entry edge, or two if 1190 // we already fixed them. 1191 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1192 1193 // We found our reduction value exit-PHI. Update it with the 1194 // incoming bypass edge. 1195 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1196 // Add an edge coming from the bypass. 1197 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1198 break; 1199 } 1200 }// end of the LCSSA phi scan. 1201 1202 // Fix the scalar loop reduction variable with the incoming reduction sum 1203 // from the vector body and from the backedge value. 1204 int IncomingEdgeBlockIdx = 1205 (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 1206 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 1207 // Pick the other block. 1208 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 1209 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1210 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1211 }// end of for each redux variable. 1212 } 1213 1214 Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 1215 assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 1216 "Invalid edge"); 1217 1218 Value *SrcMask = createBlockInMask(Src); 1219 1220 // The terminator has to be a branch inst! 1221 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 1222 assert(BI && "Unexpected terminator found"); 1223 1224 Value *EdgeMask = SrcMask; 1225 if (BI->isConditional()) { 1226 EdgeMask = getVectorValue(BI->getCondition()); 1227 if (BI->getSuccessor(0) != Dst) 1228 EdgeMask = Builder.CreateNot(EdgeMask); 1229 } 1230 1231 return Builder.CreateAnd(EdgeMask, SrcMask); 1232 } 1233 1234 Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 1235 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 1236 1237 // Loop incoming mask is all-one. 1238 if (OrigLoop->getHeader() == BB) 1239 return getVectorValue( 1240 ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1)); 1241 1242 // This is the block mask. We OR all incoming edges, and with zero. 1243 Value *BlockMask = getVectorValue( 1244 ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0)); 1245 1246 // For each pred: 1247 for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) 1248 BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); 1249 1250 return BlockMask; 1251 } 1252 1253 void 1254 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, 1255 BasicBlock *BB, PhiVector *PV) { 1256 Constant *Zero = 1257 ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); 1258 1259 // For each instruction in the old loop. 1260 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1261 switch (it->getOpcode()) { 1262 case Instruction::Br: 1263 // Nothing to do for PHIs and BR, since we already took care of the 1264 // loop control flow instructions. 1265 continue; 1266 case Instruction::PHI:{ 1267 PHINode* P = cast<PHINode>(it); 1268 // Handle reduction variables: 1269 if (Legal->getReductionVars()->count(P)) { 1270 // This is phase one of vectorizing PHIs. 1271 Type *VecTy = VectorType::get(it->getType(), VF); 1272 WidenMap[it] = 1273 PHINode::Create(VecTy, 2, "vec.phi", 1274 LoopVectorBody->getFirstInsertionPt()); 1275 PV->push_back(P); 1276 continue; 1277 } 1278 1279 // Check for PHI nodes that are lowered to vector selects. 1280 if (P->getParent() != OrigLoop->getHeader()) { 1281 // We know that all PHIs in non header blocks are converted into 1282 // selects, so we don't have to worry about the insertion order and we 1283 // can just use the builder. 1284 1285 // At this point we generate the predication tree. There may be 1286 // duplications since this is a simple recursive scan, but future 1287 // optimizations will clean it up. 1288 Value *Cond = createBlockInMask(P->getIncomingBlock(0)); 1289 WidenMap[P] = 1290 Builder.CreateSelect(Cond, 1291 getVectorValue(P->getIncomingValue(0)), 1292 getVectorValue(P->getIncomingValue(1)), 1293 "predphi"); 1294 continue; 1295 } 1296 1297 // This PHINode must be an induction variable. 1298 // Make sure that we know about it. 1299 assert(Legal->getInductionVars()->count(P) && 1300 "Not an induction variable"); 1301 1302 if (P->getType()->isIntegerTy()) { 1303 assert(P == OldInduction && "Unexpected PHI"); 1304 Value *Broadcasted = getBroadcastInstrs(Induction); 1305 // After broadcasting the induction variable we need to make the 1306 // vector consecutive by adding 0, 1, 2 ... 1307 Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); 1308 1309 WidenMap[OldInduction] = ConsecutiveInduction; 1310 continue; 1311 } 1312 1313 // Handle pointer inductions. 1314 assert(P->getType()->isPointerTy() && "Unexpected type."); 1315 Value *StartIdx = OldInduction ? 1316 Legal->getInductionVars()->lookup(OldInduction) : 1317 ConstantInt::get(Induction->getType(), 0); 1318 1319 // This is the pointer value coming into the loop. 1320 Value *StartPtr = Legal->getInductionVars()->lookup(P); 1321 1322 // This is the normalized GEP that starts counting at zero. 1323 Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, 1324 "normalized.idx"); 1325 1326 // This is the vector of results. Notice that we don't generate vector 1327 // geps because scalar geps result in better code. 1328 Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 1329 for (unsigned int i = 0; i < VF; ++i) { 1330 Constant *Idx = ConstantInt::get(Induction->getType(), i); 1331 Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); 1332 Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep"); 1333 VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 1334 Builder.getInt32(i), 1335 "insert.gep"); 1336 } 1337 1338 WidenMap[it] = VecVal; 1339 continue; 1340 } 1341 case Instruction::Add: 1342 case Instruction::FAdd: 1343 case Instruction::Sub: 1344 case Instruction::FSub: 1345 case Instruction::Mul: 1346 case Instruction::FMul: 1347 case Instruction::UDiv: 1348 case Instruction::SDiv: 1349 case Instruction::FDiv: 1350 case Instruction::URem: 1351 case Instruction::SRem: 1352 case Instruction::FRem: 1353 case Instruction::Shl: 1354 case Instruction::LShr: 1355 case Instruction::AShr: 1356 case Instruction::And: 1357 case Instruction::Or: 1358 case Instruction::Xor: { 1359 // Just widen binops. 1360 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 1361 Value *A = getVectorValue(it->getOperand(0)); 1362 Value *B = getVectorValue(it->getOperand(1)); 1363 1364 // Use this vector value for all users of the original instruction. 1365 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 1366 WidenMap[it] = V; 1367 1368 // Update the NSW, NUW and Exact flags. 1369 BinaryOperator *VecOp = cast<BinaryOperator>(V); 1370 if (isa<OverflowingBinaryOperator>(BinOp)) { 1371 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 1372 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 1373 } 1374 if (isa<PossiblyExactOperator>(VecOp)) 1375 VecOp->setIsExact(BinOp->isExact()); 1376 break; 1377 } 1378 case Instruction::Select: { 1379 // Widen selects. 1380 // If the selector is loop invariant we can create a select 1381 // instruction with a scalar condition. Otherwise, use vector-select. 1382 Value *Cond = it->getOperand(0); 1383 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 1384 1385 // The condition can be loop invariant but still defined inside the 1386 // loop. This means that we can't just use the original 'cond' value. 1387 // We have to take the 'vectorized' value and pick the first lane. 1388 // Instcombine will make this a no-op. 1389 Cond = getVectorValue(Cond); 1390 if (InvariantCond) 1391 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 1392 1393 Value *Op0 = getVectorValue(it->getOperand(1)); 1394 Value *Op1 = getVectorValue(it->getOperand(2)); 1395 WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); 1396 break; 1397 } 1398 1399 case Instruction::ICmp: 1400 case Instruction::FCmp: { 1401 // Widen compares. Generate vector compares. 1402 bool FCmp = (it->getOpcode() == Instruction::FCmp); 1403 CmpInst *Cmp = dyn_cast<CmpInst>(it); 1404 Value *A = getVectorValue(it->getOperand(0)); 1405 Value *B = getVectorValue(it->getOperand(1)); 1406 if (FCmp) 1407 WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 1408 else 1409 WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 1410 break; 1411 } 1412 1413 case Instruction::Store: { 1414 // Attempt to issue a wide store. 1415 StoreInst *SI = dyn_cast<StoreInst>(it); 1416 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1417 Value *Ptr = SI->getPointerOperand(); 1418 unsigned Alignment = SI->getAlignment(); 1419 1420 assert(!Legal->isUniform(Ptr) && 1421 "We do not allow storing to uniform addresses"); 1422 1423 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1424 1425 // This store does not use GEPs. 1426 if (!Legal->isConsecutivePtr(Ptr)) { 1427 scalarizeInstruction(it); 1428 break; 1429 } 1430 1431 if (Gep) { 1432 // The last index does not have to be the induction. It can be 1433 // consecutive and be a function of the index. For example A[I+1]; 1434 unsigned NumOperands = Gep->getNumOperands(); 1435 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1436 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1437 1438 // Create the new GEP with the new induction variable. 1439 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1440 Gep2->setOperand(NumOperands - 1, LastIndex); 1441 Ptr = Builder.Insert(Gep2); 1442 } else { 1443 // Use the induction element ptr. 1444 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1445 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1446 } 1447 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1448 Value *Val = getVectorValue(SI->getValueOperand()); 1449 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1450 break; 1451 } 1452 case Instruction::Load: { 1453 // Attempt to issue a wide load. 1454 LoadInst *LI = dyn_cast<LoadInst>(it); 1455 Type *RetTy = VectorType::get(LI->getType(), VF); 1456 Value *Ptr = LI->getPointerOperand(); 1457 unsigned Alignment = LI->getAlignment(); 1458 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1459 1460 // If the pointer is loop invariant or if it is non consecutive, 1461 // scalarize the load. 1462 bool Con = Legal->isConsecutivePtr(Ptr); 1463 if (Legal->isUniform(Ptr) || !Con) { 1464 scalarizeInstruction(it); 1465 break; 1466 } 1467 1468 if (Gep) { 1469 // The last index does not have to be the induction. It can be 1470 // consecutive and be a function of the index. For example A[I+1]; 1471 unsigned NumOperands = Gep->getNumOperands(); 1472 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1473 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1474 1475 // Create the new GEP with the new induction variable. 1476 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1477 Gep2->setOperand(NumOperands - 1, LastIndex); 1478 Ptr = Builder.Insert(Gep2); 1479 } else { 1480 // Use the induction element ptr. 1481 assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1482 Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); 1483 } 1484 1485 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1486 LI = Builder.CreateLoad(Ptr); 1487 LI->setAlignment(Alignment); 1488 // Use this vector value for all users of the load. 1489 WidenMap[it] = LI; 1490 break; 1491 } 1492 case Instruction::ZExt: 1493 case Instruction::SExt: 1494 case Instruction::FPToUI: 1495 case Instruction::FPToSI: 1496 case Instruction::FPExt: 1497 case Instruction::PtrToInt: 1498 case Instruction::IntToPtr: 1499 case Instruction::SIToFP: 1500 case Instruction::UIToFP: 1501 case Instruction::Trunc: 1502 case Instruction::FPTrunc: 1503 case Instruction::BitCast: { 1504 /// Vectorize bitcasts. 1505 CastInst *CI = dyn_cast<CastInst>(it); 1506 Value *A = getVectorValue(it->getOperand(0)); 1507 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1508 WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1509 break; 1510 } 1511 1512 default: 1513 /// All other instructions are unsupported. Scalarize them. 1514 scalarizeInstruction(it); 1515 break; 1516 }// end of switch. 1517 }// end of for_each instr. 1518 } 1519 1520 1521 void InnerLoopVectorizer::updateAnalysis() { 1522 // Forget the original basic block. 1523 SE->forgetLoop(OrigLoop); 1524 1525 // Update the dominator tree information. 1526 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1527 "Entry does not dominate exit."); 1528 1529 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1530 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1531 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1532 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1533 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1534 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1535 1536 DEBUG(DT->verifyAnalysis()); 1537 } 1538 1539 1540 bool LoopVectorizationLegality::canVectorizeWithIfConvert() { 1541 if (!EnableIfConversion) 1542 return false; 1543 1544 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 1545 std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); 1546 1547 // Collect the blocks that need predication. 1548 for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { 1549 BasicBlock *BB = LoopBlocks[i]; 1550 1551 // We must have at most two predecessors because we need to convert 1552 // all PHIs to selects. 1553 unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); 1554 if (Preds > 2) 1555 return false; 1556 1557 // We must be able to predicate all blocks that need to be predicated. 1558 if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) 1559 return false; 1560 } 1561 1562 // We can if-convert this loop. 1563 return true; 1564 } 1565 1566 bool LoopVectorizationLegality::canVectorize() { 1567 assert(TheLoop->getLoopPreheader() && "No preheader!!"); 1568 1569 // We can only vectorize innermost loops. 1570 if (TheLoop->getSubLoopsVector().size()) 1571 return false; 1572 1573 // We must have a single backedge. 1574 if (TheLoop->getNumBackEdges() != 1) 1575 return false; 1576 1577 // We must have a single exiting block. 1578 if (!TheLoop->getExitingBlock()) 1579 return false; 1580 1581 unsigned NumBlocks = TheLoop->getNumBlocks(); 1582 1583 // Check if we can if-convert non single-bb loops. 1584 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 1585 DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 1586 return false; 1587 } 1588 1589 // We need to have a loop header. 1590 BasicBlock *Latch = TheLoop->getLoopLatch(); 1591 DEBUG(dbgs() << "LV: Found a loop: " << 1592 TheLoop->getHeader()->getName() << "\n"); 1593 1594 // ScalarEvolution needs to be able to find the exit count. 1595 const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); 1596 if (ExitCount == SE->getCouldNotCompute()) { 1597 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1598 return false; 1599 } 1600 1601 // Do not loop-vectorize loops with a tiny trip count. 1602 unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); 1603 if (TC > 0u && TC < TinyTripCountThreshold) { 1604 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1605 "This loop is not worth vectorizing.\n"); 1606 return false; 1607 } 1608 1609 // Check if we can vectorize the instructions and CFG in this loop. 1610 if (!canVectorizeInstrs()) { 1611 DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 1612 return false; 1613 } 1614 1615 // Go over each instruction and look at memory deps. 1616 if (!canVectorizeMemory()) { 1617 DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 1618 return false; 1619 } 1620 1621 // Collect all of the variables that remain uniform after vectorization. 1622 collectLoopUniforms(); 1623 1624 DEBUG(dbgs() << "LV: We can vectorize this loop" << 1625 (PtrRtCheck.Need ? " (with a runtime bound check)" : "") 1626 <<"!\n"); 1627 1628 // Okay! We can vectorize. At this point we don't have any other mem analysis 1629 // which may limit our maximum vectorization factor, so just return true with 1630 // no restrictions. 1631 return true; 1632 } 1633 1634 bool LoopVectorizationLegality::canVectorizeInstrs() { 1635 BasicBlock *PreHeader = TheLoop->getLoopPreheader(); 1636 BasicBlock *Header = TheLoop->getHeader(); 1637 1638 // For each block in the loop. 1639 for (Loop::block_iterator bb = TheLoop->block_begin(), 1640 be = TheLoop->block_end(); bb != be; ++bb) { 1641 1642 // Scan the instructions in the block and look for hazards. 1643 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1644 ++it) { 1645 1646 if (PHINode *Phi = dyn_cast<PHINode>(it)) { 1647 // This should not happen because the loop should be normalized. 1648 if (Phi->getNumIncomingValues() != 2) { 1649 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1650 return false; 1651 } 1652 1653 // If this PHINode is not in the header block, then we know that we 1654 // can convert it to select during if-conversion. 1655 if (*bb != Header) 1656 continue; 1657 1658 // This is the value coming from the preheader. 1659 Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); 1660 1661 // We only look at integer and pointer phi nodes. 1662 if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { 1663 DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); 1664 Inductions[Phi] = StartValue; 1665 continue; 1666 } else if (!Phi->getType()->isIntegerTy()) { 1667 DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 1668 return false; 1669 } 1670 1671 // Handle integer PHIs: 1672 if (isInductionVariable(Phi)) { 1673 if (Induction) { 1674 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1675 return false; 1676 } 1677 DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); 1678 Induction = Phi; 1679 Inductions[Phi] = StartValue; 1680 continue; 1681 } 1682 if (AddReductionVar(Phi, IntegerAdd)) { 1683 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1684 continue; 1685 } 1686 if (AddReductionVar(Phi, IntegerMult)) { 1687 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1688 continue; 1689 } 1690 if (AddReductionVar(Phi, IntegerOr)) { 1691 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1692 continue; 1693 } 1694 if (AddReductionVar(Phi, IntegerAnd)) { 1695 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1696 continue; 1697 } 1698 if (AddReductionVar(Phi, IntegerXor)) { 1699 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1700 continue; 1701 } 1702 1703 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1704 return false; 1705 }// end of PHI handling 1706 1707 // We still don't handle functions. 1708 CallInst *CI = dyn_cast<CallInst>(it); 1709 if (CI) { 1710 DEBUG(dbgs() << "LV: Found a call site.\n"); 1711 return false; 1712 } 1713 1714 // We do not re-vectorize vectors. 1715 if (!VectorType::isValidElementType(it->getType()) && 1716 !it->getType()->isVoidTy()) { 1717 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1718 return false; 1719 } 1720 1721 // Reduction instructions are allowed to have exit users. 1722 // All other instructions must not have external users. 1723 if (!AllowedExit.count(it)) 1724 //Check that all of the users of the loop are inside the BB. 1725 for (Value::use_iterator I = it->use_begin(), E = it->use_end(); 1726 I != E; ++I) { 1727 Instruction *U = cast<Instruction>(*I); 1728 // This user may be a reduction exit value. 1729 if (!TheLoop->contains(U)) { 1730 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1731 return false; 1732 } 1733 } 1734 } // next instr. 1735 1736 } 1737 1738 if (!Induction) { 1739 DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); 1740 assert(getInductionVars()->size() && "No induction variables"); 1741 } 1742 1743 return true; 1744 } 1745 1746 void LoopVectorizationLegality::collectLoopUniforms() { 1747 // We now know that the loop is vectorizable! 1748 // Collect variables that will remain uniform after vectorization. 1749 std::vector<Value*> Worklist; 1750 BasicBlock *Latch = TheLoop->getLoopLatch(); 1751 1752 // Start with the conditional branch and walk up the block. 1753 Worklist.push_back(Latch->getTerminator()->getOperand(0)); 1754 1755 while (Worklist.size()) { 1756 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1757 Worklist.pop_back(); 1758 1759 // Look at instructions inside this loop. 1760 // Stop when reaching PHI nodes. 1761 // TODO: we need to follow values all over the loop, not only in this block. 1762 if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 1763 continue; 1764 1765 // This is a known uniform. 1766 Uniforms.insert(I); 1767 1768 // Insert all operands. 1769 for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { 1770 Worklist.push_back(I->getOperand(i)); 1771 } 1772 } 1773 } 1774 1775 bool LoopVectorizationLegality::canVectorizeMemory() { 1776 typedef SmallVector<Value*, 16> ValueVector; 1777 typedef SmallPtrSet<Value*, 16> ValueSet; 1778 // Holds the Load and Store *instructions*. 1779 ValueVector Loads; 1780 ValueVector Stores; 1781 PtrRtCheck.Pointers.clear(); 1782 PtrRtCheck.Need = false; 1783 1784 // For each block. 1785 for (Loop::block_iterator bb = TheLoop->block_begin(), 1786 be = TheLoop->block_end(); bb != be; ++bb) { 1787 1788 // Scan the BB and collect legal loads and stores. 1789 for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 1790 ++it) { 1791 1792 // If this is a load, save it. If this instruction can read from memory 1793 // but is not a load, then we quit. Notice that we don't handle function 1794 // calls that read or write. 1795 if (it->mayReadFromMemory()) { 1796 LoadInst *Ld = dyn_cast<LoadInst>(it); 1797 if (!Ld) return false; 1798 if (!Ld->isSimple()) { 1799 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1800 return false; 1801 } 1802 Loads.push_back(Ld); 1803 continue; 1804 } 1805 1806 // Save 'store' instructions. Abort if other instructions write to memory. 1807 if (it->mayWriteToMemory()) { 1808 StoreInst *St = dyn_cast<StoreInst>(it); 1809 if (!St) return false; 1810 if (!St->isSimple()) { 1811 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1812 return false; 1813 } 1814 Stores.push_back(St); 1815 } 1816 } // next instr. 1817 } // next block. 1818 1819 // Now we have two lists that hold the loads and the stores. 1820 // Next, we find the pointers that they use. 1821 1822 // Check if we see any stores. If there are no stores, then we don't 1823 // care if the pointers are *restrict*. 1824 if (!Stores.size()) { 1825 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1826 return true; 1827 } 1828 1829 // Holds the read and read-write *pointers* that we find. 1830 ValueVector Reads; 1831 ValueVector ReadWrites; 1832 1833 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1834 // multiple times on the same object. If the ptr is accessed twice, once 1835 // for read and once for write, it will only appear once (on the write 1836 // list). This is okay, since we are going to check for conflicts between 1837 // writes and between reads and writes, but not between reads and reads. 1838 ValueSet Seen; 1839 1840 ValueVector::iterator I, IE; 1841 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1842 StoreInst *ST = dyn_cast<StoreInst>(*I); 1843 assert(ST && "Bad StoreInst"); 1844 Value* Ptr = ST->getPointerOperand(); 1845 1846 if (isUniform(Ptr)) { 1847 DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 1848 return false; 1849 } 1850 1851 // If we did *not* see this pointer before, insert it to 1852 // the read-write list. At this phase it is only a 'write' list. 1853 if (Seen.insert(Ptr)) 1854 ReadWrites.push_back(Ptr); 1855 } 1856 1857 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1858 LoadInst *LD = dyn_cast<LoadInst>(*I); 1859 assert(LD && "Bad LoadInst"); 1860 Value* Ptr = LD->getPointerOperand(); 1861 // If we did *not* see this pointer before, insert it to the 1862 // read list. If we *did* see it before, then it is already in 1863 // the read-write list. This allows us to vectorize expressions 1864 // such as A[i] += x; Because the address of A[i] is a read-write 1865 // pointer. This only works if the index of A[i] is consecutive. 1866 // If the address of i is unknown (for example A[B[i]]) then we may 1867 // read a few words, modify, and write a few words, and some of the 1868 // words may be written to the same address. 1869 if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) 1870 Reads.push_back(Ptr); 1871 } 1872 1873 // If we write (or read-write) to a single destination and there are no 1874 // other reads in this loop then is it safe to vectorize. 1875 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1876 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1877 return true; 1878 } 1879 1880 // Find pointers with computable bounds. We are going to use this information 1881 // to place a runtime bound check. 1882 bool RT = true; 1883 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) 1884 if (hasComputableBounds(*I)) { 1885 PtrRtCheck.insert(SE, TheLoop, *I); 1886 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1887 } else { 1888 RT = false; 1889 break; 1890 } 1891 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) 1892 if (hasComputableBounds(*I)) { 1893 PtrRtCheck.insert(SE, TheLoop, *I); 1894 DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); 1895 } else { 1896 RT = false; 1897 break; 1898 } 1899 1900 // Check that we did not collect too many pointers or found a 1901 // unsizeable pointer. 1902 if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { 1903 PtrRtCheck.reset(); 1904 RT = false; 1905 } 1906 1907 PtrRtCheck.Need = RT; 1908 1909 if (RT) { 1910 DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); 1911 } 1912 1913 // Now that the pointers are in two lists (Reads and ReadWrites), we 1914 // can check that there are no conflicts between each of the writes and 1915 // between the writes to the reads. 1916 ValueSet WriteObjects; 1917 ValueVector TempObjects; 1918 1919 // Check that the read-writes do not conflict with other read-write 1920 // pointers. 1921 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1922 GetUnderlyingObjects(*I, TempObjects, DL); 1923 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1924 it != e; ++it) { 1925 if (!isIdentifiedObject(*it)) { 1926 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1927 return RT; 1928 } 1929 if (!WriteObjects.insert(*it)) { 1930 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1931 << **it <<"\n"); 1932 return RT; 1933 } 1934 } 1935 TempObjects.clear(); 1936 } 1937 1938 /// Check that the reads don't conflict with the read-writes. 1939 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1940 GetUnderlyingObjects(*I, TempObjects, DL); 1941 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1942 it != e; ++it) { 1943 if (!isIdentifiedObject(*it)) { 1944 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1945 return RT; 1946 } 1947 if (WriteObjects.count(*it)) { 1948 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1949 << **it <<"\n"); 1950 return RT; 1951 } 1952 } 1953 TempObjects.clear(); 1954 } 1955 1956 // It is safe to vectorize and we don't need any runtime checks. 1957 DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); 1958 PtrRtCheck.reset(); 1959 return true; 1960 } 1961 1962 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1963 ReductionKind Kind) { 1964 if (Phi->getNumIncomingValues() != 2) 1965 return false; 1966 1967 // Reduction variables are only found in the loop header block. 1968 if (Phi->getParent() != TheLoop->getHeader()) 1969 return false; 1970 1971 // Obtain the reduction start value from the value that comes from the loop 1972 // preheader. 1973 Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); 1974 1975 // ExitInstruction is the single value which is used outside the loop. 1976 // We only allow for a single reduction value to be used outside the loop. 1977 // This includes users of the reduction, variables (which form a cycle 1978 // which ends in the phi node). 1979 Instruction *ExitInstruction = 0; 1980 1981 // Iter is our iterator. We start with the PHI node and scan for all of the 1982 // users of this instruction. All users must be instructions which can be 1983 // used as reduction variables (such as ADD). We may have a single 1984 // out-of-block user. They cycle must end with the original PHI. 1985 // Also, we can't have multiple block-local users. 1986 Instruction *Iter = Phi; 1987 while (true) { 1988 // If the instruction has no users then this is a broken 1989 // chain and can't be a reduction variable. 1990 if (Iter->use_empty()) 1991 return false; 1992 1993 // Any reduction instr must be of one of the allowed kinds. 1994 if (!isReductionInstr(Iter, Kind)) 1995 return false; 1996 1997 // Did we find a user inside this block ? 1998 bool FoundInBlockUser = false; 1999 // Did we reach the initial PHI node ? 2000 bool FoundStartPHI = false; 2001 2002 // For each of the *users* of iter. 2003 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 2004 it != e; ++it) { 2005 Instruction *U = cast<Instruction>(*it); 2006 // We already know that the PHI is a user. 2007 if (U == Phi) { 2008 FoundStartPHI = true; 2009 continue; 2010 } 2011 2012 // Check if we found the exit user. 2013 BasicBlock *Parent = U->getParent(); 2014 if (!TheLoop->contains(Parent)) { 2015 // Exit if you find multiple outside users. 2016 if (ExitInstruction != 0) 2017 return false; 2018 ExitInstruction = Iter; 2019 } 2020 2021 // We allow in-loop PHINodes which are not the original reduction PHI 2022 // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE 2023 // structure) then don't skip this PHI. 2024 if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() && 2025 TheLoop->contains(U) && Iter->getNumUses() > 1) 2026 continue; 2027 2028 // We can't have multiple inside users. 2029 if (FoundInBlockUser) 2030 return false; 2031 FoundInBlockUser = true; 2032 Iter = U; 2033 } 2034 2035 // We found a reduction var if we have reached the original 2036 // phi node and we only have a single instruction with out-of-loop 2037 // users. 2038 if (FoundStartPHI && ExitInstruction) { 2039 // This instruction is allowed to have out-of-loop users. 2040 AllowedExit.insert(ExitInstruction); 2041 2042 // Save the description of this reduction variable. 2043 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 2044 Reductions[Phi] = RD; 2045 return true; 2046 } 2047 2048 // If we've reached the start PHI but did not find an outside user then 2049 // this is dead code. Abort. 2050 if (FoundStartPHI) 2051 return false; 2052 } 2053 } 2054 2055 bool 2056 LoopVectorizationLegality::isReductionInstr(Instruction *I, 2057 ReductionKind Kind) { 2058 switch (I->getOpcode()) { 2059 default: 2060 return false; 2061 case Instruction::PHI: 2062 // possibly. 2063 return true; 2064 case Instruction::Add: 2065 case Instruction::Sub: 2066 return Kind == IntegerAdd; 2067 case Instruction::Mul: 2068 return Kind == IntegerMult; 2069 case Instruction::And: 2070 return Kind == IntegerAnd; 2071 case Instruction::Or: 2072 return Kind == IntegerOr; 2073 case Instruction::Xor: 2074 return Kind == IntegerXor; 2075 } 2076 } 2077 2078 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 2079 Type *PhiTy = Phi->getType(); 2080 // We only handle integer and pointer inductions variables. 2081 if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) 2082 return false; 2083 2084 // Check that the PHI is consecutive and starts at zero. 2085 const SCEV *PhiScev = SE->getSCEV(Phi); 2086 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2087 if (!AR) { 2088 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 2089 return false; 2090 } 2091 const SCEV *Step = AR->getStepRecurrence(*SE); 2092 2093 // Integer inductions need to have a stride of one. 2094 if (PhiTy->isIntegerTy()) 2095 return Step->isOne(); 2096 2097 // Calculate the pointer stride and check if it is consecutive. 2098 const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 2099 if (!C) return false; 2100 2101 assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); 2102 uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); 2103 return (C->getValue()->equalsInt(Size)); 2104 } 2105 2106 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 2107 assert(TheLoop->contains(BB) && "Unknown block used"); 2108 2109 // Blocks that do not dominate the latch need predication. 2110 BasicBlock* Latch = TheLoop->getLoopLatch(); 2111 return !DT->dominates(BB, Latch); 2112 } 2113 2114 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { 2115 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2116 // We don't predicate loads/stores at the moment. 2117 if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) 2118 return false; 2119 2120 // The isntructions below can trap. 2121 switch (it->getOpcode()) { 2122 default: continue; 2123 case Instruction::UDiv: 2124 case Instruction::SDiv: 2125 case Instruction::URem: 2126 case Instruction::SRem: 2127 return false; 2128 } 2129 } 2130 2131 return true; 2132 } 2133 2134 bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { 2135 const SCEV *PhiScev = SE->getSCEV(Ptr); 2136 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 2137 if (!AR) 2138 return false; 2139 2140 return AR->isAffine(); 2141 } 2142 2143 unsigned 2144 LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { 2145 if (!VTTI) { 2146 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 2147 return 1; 2148 } 2149 2150 float Cost = expectedCost(1); 2151 unsigned Width = 1; 2152 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 2153 for (unsigned i=2; i <= VF; i*=2) { 2154 // Notice that the vector loop needs to be executed less times, so 2155 // we need to divide the cost of the vector loops by the width of 2156 // the vector elements. 2157 float VectorCost = expectedCost(i) / (float)i; 2158 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 2159 (int)VectorCost << ".\n"); 2160 if (VectorCost < Cost) { 2161 Cost = VectorCost; 2162 Width = i; 2163 } 2164 } 2165 2166 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 2167 return Width; 2168 } 2169 2170 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 2171 unsigned Cost = 0; 2172 2173 // For each block. 2174 for (Loop::block_iterator bb = TheLoop->block_begin(), 2175 be = TheLoop->block_end(); bb != be; ++bb) { 2176 unsigned BlockCost = 0; 2177 BasicBlock *BB = *bb; 2178 2179 // For each instruction in the old loop. 2180 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 2181 unsigned C = getInstructionCost(it, VF); 2182 Cost += C; 2183 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << 2184 VF << " For instruction: "<< *it << "\n"); 2185 } 2186 2187 // We assume that if-converted blocks have a 50% chance of being executed. 2188 // When the code is scalar then some of the blocks are avoided due to CF. 2189 // When the code is vectorized we execute all code paths. 2190 if (Legal->blockNeedsPredication(*bb) && VF == 1) 2191 BlockCost /= 2; 2192 2193 Cost += BlockCost; 2194 } 2195 2196 return Cost; 2197 } 2198 2199 unsigned 2200 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 2201 assert(VTTI && "Invalid vector target transformation info"); 2202 2203 // If we know that this instruction will remain uniform, check the cost of 2204 // the scalar version. 2205 if (Legal->isUniformAfterVectorization(I)) 2206 VF = 1; 2207 2208 Type *RetTy = I->getType(); 2209 Type *VectorTy = ToVectorTy(RetTy, VF); 2210 2211 2212 // TODO: We need to estimate the cost of intrinsic calls. 2213 switch (I->getOpcode()) { 2214 case Instruction::GetElementPtr: 2215 // We mark this instruction as zero-cost because scalar GEPs are usually 2216 // lowered to the intruction addressing mode. At the moment we don't 2217 // generate vector geps. 2218 return 0; 2219 case Instruction::Br: { 2220 return VTTI->getCFInstrCost(I->getOpcode()); 2221 } 2222 case Instruction::PHI: 2223 //TODO: IF-converted IFs become selects. 2224 return 0; 2225 case Instruction::Add: 2226 case Instruction::FAdd: 2227 case Instruction::Sub: 2228 case Instruction::FSub: 2229 case Instruction::Mul: 2230 case Instruction::FMul: 2231 case Instruction::UDiv: 2232 case Instruction::SDiv: 2233 case Instruction::FDiv: 2234 case Instruction::URem: 2235 case Instruction::SRem: 2236 case Instruction::FRem: 2237 case Instruction::Shl: 2238 case Instruction::LShr: 2239 case Instruction::AShr: 2240 case Instruction::And: 2241 case Instruction::Or: 2242 case Instruction::Xor: 2243 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 2244 case Instruction::Select: { 2245 SelectInst *SI = cast<SelectInst>(I); 2246 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 2247 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 2248 Type *CondTy = SI->getCondition()->getType(); 2249 if (ScalarCond) 2250 CondTy = VectorType::get(CondTy, VF); 2251 2252 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 2253 } 2254 case Instruction::ICmp: 2255 case Instruction::FCmp: { 2256 Type *ValTy = I->getOperand(0)->getType(); 2257 VectorTy = ToVectorTy(ValTy, VF); 2258 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 2259 } 2260 case Instruction::Store: { 2261 StoreInst *SI = cast<StoreInst>(I); 2262 Type *ValTy = SI->getValueOperand()->getType(); 2263 VectorTy = ToVectorTy(ValTy, VF); 2264 2265 if (VF == 1) 2266 return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, 2267 SI->getAlignment(), SI->getPointerAddressSpace()); 2268 2269 // Scalarized stores. 2270 if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { 2271 unsigned Cost = 0; 2272 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2273 ValTy); 2274 // The cost of extracting from the value vector. 2275 Cost += VF * (ExtCost); 2276 // The cost of the scalar stores. 2277 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2278 ValTy->getScalarType(), 2279 SI->getAlignment(), 2280 SI->getPointerAddressSpace()); 2281 return Cost; 2282 } 2283 2284 // Wide stores. 2285 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), 2286 SI->getPointerAddressSpace()); 2287 } 2288 case Instruction::Load: { 2289 LoadInst *LI = cast<LoadInst>(I); 2290 2291 if (VF == 1) 2292 return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, 2293 LI->getAlignment(), 2294 LI->getPointerAddressSpace()); 2295 2296 // Scalarized loads. 2297 if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { 2298 unsigned Cost = 0; 2299 unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); 2300 // The cost of inserting the loaded value into the result vector. 2301 Cost += VF * (InCost); 2302 // The cost of the scalar stores. 2303 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 2304 RetTy->getScalarType(), 2305 LI->getAlignment(), 2306 LI->getPointerAddressSpace()); 2307 return Cost; 2308 } 2309 2310 // Wide loads. 2311 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), 2312 LI->getPointerAddressSpace()); 2313 } 2314 case Instruction::ZExt: 2315 case Instruction::SExt: 2316 case Instruction::FPToUI: 2317 case Instruction::FPToSI: 2318 case Instruction::FPExt: 2319 case Instruction::PtrToInt: 2320 case Instruction::IntToPtr: 2321 case Instruction::SIToFP: 2322 case Instruction::UIToFP: 2323 case Instruction::Trunc: 2324 case Instruction::FPTrunc: 2325 case Instruction::BitCast: { 2326 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 2327 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 2328 } 2329 default: { 2330 // We are scalarizing the instruction. Return the cost of the scalar 2331 // instruction, plus the cost of insert and extract into vector 2332 // elements, times the vector width. 2333 unsigned Cost = 0; 2334 2335 bool IsVoid = RetTy->isVoidTy(); 2336 2337 unsigned InsCost = (IsVoid ? 0 : 2338 VTTI->getInstrCost(Instruction::InsertElement, 2339 VectorTy)); 2340 2341 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 2342 VectorTy); 2343 2344 // The cost of inserting the results plus extracting each one of the 2345 // operands. 2346 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 2347 2348 // The cost of executing VF copies of the scalar instruction. 2349 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 2350 return Cost; 2351 } 2352 }// end of switch. 2353 } 2354 2355 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 2356 if (Scalar->isVoidTy() || VF == 1) 2357 return Scalar; 2358 return VectorType::get(Scalar, VF); 2359 } 2360 2361 } // namespace 2362 2363 char LoopVectorize::ID = 0; 2364 static const char lv_name[] = "Loop Vectorization"; 2365 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 2366 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 2367 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 2368 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 2369 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 2370 2371 namespace llvm { 2372 Pass *createLoopVectorizePass() { 2373 return new LoopVectorize(); 2374 } 2375 } 2376