//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
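// For illustration only (this example is not part of the pass itself):
// given a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// and a vectorization factor of 4, the vector loop computes C[i..i+3] from
// A[i..i+3] and B[i..i+3] in one 'wide' iteration and increments i by 4,
// while the scalar epilogue loop handles the remaining n % 4 iterations.
//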
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Constants.h"
#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Type.h"
#include "llvm/Value.h"
#include <algorithm>
using namespace llvm;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Set the default vectorization width. Zero is autoselect."));

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;

/// This is the highest vector width that we try to generate.
const unsigned MaxVectorSize = 8;

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
class InnerLoopVectorizer {
public:
  /// Ctor.
  InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                      DominatorTree *Dt, DataLayout *Dl,
                      unsigned VecWidth):
    OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
    Builder(Se->getContext()), Induction(0), OldInduction(0) { }

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
                         Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  void scalarizeInstruction(Instruction *Instr);
  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
  /// for each element in the vector, starting from zero.
  Value *getConsecutiveVector(Value* Val);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

  /// Get a uniform vector of constant integers. We use this to get
  /// vectors of ones and zeros for the reduction code.
  Constant* getUniformVector(unsigned Val, Type* ScalarTy);

  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
  Loop *OrigLoop;
  // Scev analysis to use.
  ScalarEvolution *SE;
  // Loop Info.
  LoopInfo *LI;
  // Dominator Tree.
  DominatorTree *DT;
  // Data Layout.
  DataLayout *DL;
  // The vectorization factor to use.
  unsigned VF;

  // The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle block between the vector loop and the scalar loop.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// The first bypass block.
  BasicBlock *LoopBypassBlock;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  // Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, that all types are supported and
///   vectorize-able, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
                            DominatorTree *Dt):
    TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }

  /// This represents the kinds of reductions that we support.
  enum ReductionKind {
    NoReduction, /// Not a reduction.
    IntegerAdd,  /// Sum of numbers.
    IntegerMult, /// Product of numbers.
    IntegerOr,   /// Bitwise or logical OR of numbers.
    IntegerAnd,  /// Bitwise or logical AND of numbers.
    IntegerXor   /// Bitwise or logical XOR of numbers.
  };
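  // For illustration only (not code used by this pass): in a loop such as
  //
  //   int sum = 7;
  //   for (int i = 0; i < n; ++i)
  //     sum += A[i];
  //
  // 'sum' is an IntegerAdd reduction. Its ReductionDescriptor below would
  // record the start value (7), the add instruction whose value leaves the
  // loop as LoopExitInstr, and Kind == IntegerAdd. The start value does not
  // have to be the identity element of the operation.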
  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    // Default C'tor.
    ReductionDescriptor():
      StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}

    // C'tor.
    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
      StartValue(Start), LoopExitInstr(Exit), Kind(K) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck(): Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
      const SCEV *Sc = SE->getSCEV(Ptr);
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
      assert(AR && "Invalid addrec expression");
      const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
      Pointers.push_back(Ptr);
      Starts.push_back(AR->getStart());
      Ends.push_back(ScEnd);
    }

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the initial
  /// value entering the loop.
  typedef DenseMap<PHINode*, Value*> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  bool isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
  /// Returns the information that we collected about the runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs(BasicBlock &BB);

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Return true if all of the instructions in the block can be speculatively
  /// executed.
  bool blockCanBePredicated(BasicBlock *BB);

  /// Returns true if 'Phi' is a reduction variable of kind 'Kind'.
  /// If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns true if the instruction I can be a reduction variable of type
  /// 'Kind'.
  bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns true if 'Phi' is an induction variable.
  bool isInductionVariable(PHINode *Phi);
  /// Return true if we can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  // Dominators.
  DominatorTree *DT;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
  /// C'tor.
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
    TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF.
  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const VectorTargetTransformInfo *VTTI;
};

struct LoopVectorize : public LoopPass {
  static char ID; // Pass identification, replacement for typeid

  LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Select the preferred vectorization factor.
    unsigned VF = 1;
    if (VectorizationFactor == 0) {
      const VectorTargetTransformInfo *VTTI = 0;
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        return false;
      }

    } else {
      // Use the user command flag.
      VF = VectorizationFactor;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
          "\n");
    // If we decided that it is *legal* to vectorize the loop then do it.
    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<DominatorTree>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Create the types.
  LLVMContext &C = V->getContext();
  Type *VTy = VectorType::get(V->getType(), VF);
  Type *I32 = IntegerType::getInt32Ty(C);

  // Save the current insertion location.
  Instruction *Loc = Builder.GetInsertPoint();

  // We need to place the broadcast of invariant variables outside the loop.
  bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);

  // Place the code for broadcasting invariant variables in the new preheader.
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  Constant *Zero = ConstantInt::get(I32, 0);
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                            "broadcast");

  // Restore the builder insertion point.
  if (Invariant)
    Builder.SetInsertPoint(Loc);

  return Shuf;
}

Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  unsigned VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (unsigned i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");

  // If this pointer is an induction variable, it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && getInductionVars()->count(Phi))
    return true;

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return false;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return false;
  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return true;
  }

  return false;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

Value *InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  // If we saved a vectorized copy of V, use it.
  Value *&MapEntry = WidenMap[V];
  if (MapEntry)
    return MapEntry;

  // Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  MapEntry = B;
  return B;
}

Constant*
InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
}

void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<Value*, 8> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap[SrcInst]);
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      Params.push_back(SrcOp);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
  Value *VecResults = 0;

  // If we have a return value, create an empty vector. We place the scalarized
  // instructions in this vector.
  if (!IsVoidRetTy)
    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));

  // For each scalar that we create:
  for (unsigned i = 0; i < VF; ++i) {
    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op];
      // Param is a vector. Need to extract the right lane.
      if (Op->getType()->isVectorTy())
        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
      Cloned->setOperand(op, Op);
    }
    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
                                               Builder.getInt32(i));
  }

  if (!IsVoidRetTy)
    WidenMap[Instr] = VecResults;
}

Value*
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                     Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Value *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value*, 2> Starts;
  SmallVector<Value*, 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr <<"\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
                                       PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  // Check every pair of pointer ranges for a possible overlap.
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[i], Ends[j], "bound0", Loc);
      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[j], Ends[i], "bound1", Loc);
      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                 "found.conflict", Loc);
      if (MemoryRuntimeCheck)
        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
                                                    MemoryRuntimeCheck,
                                                    IsConflict,
                                                    "conflict.rdx", Loc);
      else
        MemoryRuntimeCheck = IsConflict;
    }
  }

  return MemoryRuntimeCheck;
}

void
InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 /*
  In this function we generate a new loop. The new loop will contain
  the vectorized instructions while the old loop will continue to run the
  scalar remainder.

     [ ]     <-- vector loop bypass.
   /  |
  /   v
 |   [ ]     <-- vector pre header.
 |    |
 |    v
 |   [  ] \
 |   [  ]_|  <-- vector loop.
 |    |
  \   v
   >[ ]      <--- middle-block.
  /   |
 /    v
 |   [ ]     <--- new preheader.
 |    |
 |    v
 |   [  ] \
 |   [  ]_|  <-- old scalar loop to handle remainder.
  \   |
   \  v
   >[ ]      <-- exit block.
    ...
 */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
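  // For illustration (not code in this function): a loop such as
  //
  //   for (char *p = begin; p != end; ++p)
  //     *p = 0;
  //
  // has only a pointer induction variable. In that case OldInduction is null
  // and we create a brand new integer induction variable of pointer width.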
  Type *IdxTy = OldInduction ? OldInduction->getType() :
    DL->getIntPtrType(SE->getContext());

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Value *StartIdx = OldInduction ?
    OldInduction->getIncomingValueForBlock(BypassBlock):
    ConstantInt::get(IdxTy, 0);

  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
  assert(BypassBlock && "Invalid loop structure");

  // Generate the code that checks at runtime if arrays overlap.
  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
                                              BypassBlock->getTerminator());

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // This is the location in which we add all of the logic for bypassing
  // the new vector loop.
  Instruction *Loc = BypassBlock->getTerminator();

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  Constant *Step = ConstantInt::get(IdxTy, VF);

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
    else
      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Constant *CIVF = ConstantInt::get(IdxTy, VF);
  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                     "end.idx.rnd.down", Loc);
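  // A worked example of the computation above (illustrative only): with a
  // start index of 0, N == 10 and VF == 4 we get n.mod.vf == 2, n.vec == 8
  // and end.idx.rnd.down == 8, so the vector body runs iterations [0, 8) and
  // the scalar remainder loop handles iterations 8 and 9.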
  // Now, compare the new count to zero. If it is zero skip the vector loop
  // and jump to the scalar loop.
  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                               IdxEndRoundDown,
                               StartIdx,
                               "cmp.zero", Loc);

  // If we are using memory runtime checks, fold them into the bypass condition.
  if (MemoryRuntimeCheck)
    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
                                 "CntOrMem", Loc);

  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
  // Remove the old terminator.
  Loc->eraseFromParent();

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
    Value *EndValue = 0;
    if (OrigPhi->getType()->isIntegerTy()) {
      // Handle the integer induction counter:
      assert(OrigPhi == OldInduction && "Unknown integer PHI");
      // We know what the end value is.
      EndValue = IdxEndRoundDown;
      // We also know which PHI node holds it.
      ResumeIndex = ResumeVal;
    } else {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
                                           "ptr.ind.end",
                                           BypassBlock->getTerminator());
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    ResumeVal->addIncoming(I->second, BypassBlock);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction) {
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    ResumeIndex->addIncoming(StartIdx, BypassBlock);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where the scalar loop needs to continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());

  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Create and register the new vector loop.
  Loop* Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  } else {
    LI->addTopLevelLoop(Lp);
  }

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
  LoopBypassBlock = BypassBlock;
}

/// This function returns the identity element (or neutral element) for
/// the operation K.
static unsigned
getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
  switch (K) {
  case LoopVectorizationLegality::IntegerXor:
  case LoopVectorizationLegality::IntegerAdd:
  case LoopVectorizationLegality::IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return 0;
  case LoopVectorizationLegality::IntegerMult:
    // Multiplying a number by 1 does not change it.
    return 1;
  case LoopVectorizationLegality::IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return -1;
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  typedef SmallVector<PHINode*, 4> PhiVector;
  BasicBlock &BB = *OrigLoop->getHeader();
  Constant *Zero = ConstantInt::get(
    IntegerType::getInt32Ty(BB.getContext()), 0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI:{
      PHINode* P = cast<PHINode>(Inst);
      // Handle reduction variables:
      if (Legal->getReductionVars()->count(P)) {
        // This is phase one of vectorizing PHIs.
        Type *VecTy = VectorType::get(Inst->getType(), VF);
        WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
                                         LoopVectorBody->getFirstInsertionPt());
        RdxPHIsToFix.push_back(P);
        continue;
      }

      // This PHINode must be an induction variable.
      // Make sure that we know about it.
      assert(Legal->getInductionVars()->count(P) &&
             "Not an induction variable");

      if (P->getType()->isIntegerTy()) {
        assert(P == OldInduction && "Unexpected PHI");
        Value *Broadcasted = getBroadcastInstrs(Induction);
        // After broadcasting the induction variable we need to make the
        // vector consecutive by adding 0, 1, 2 ...
        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);

        WidenMap[OldInduction] = ConsecutiveInduction;
        continue;
      }

      // Handle pointer inductions.
      assert(P->getType()->isPointerTy() && "Unexpected type.");
      Value *StartIdx = OldInduction ?
        Legal->getInductionVars()->lookup(OldInduction) :
        ConstantInt::get(Induction->getType(), 0);

      // This is the pointer value coming into the loop.
      Value *StartPtr = Legal->getInductionVars()->lookup(P);

      // This is the normalized GEP that starts counting at zero.
      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                               "normalized.idx");

      // This is the vector of results. Notice that we don't generate vector
      // geps because scalar geps result in better code.
      Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
      for (unsigned int i = 0; i < VF; ++i) {
        Constant *Idx = ConstantInt::get(Induction->getType(), i);
        Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
        Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
        VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                             Builder.getInt32(i),
                                             "insert.gep");
      }

      WidenMap[Inst] = VecVal;
      continue;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Value *B = getVectorValue(Inst->getOperand(1));

      // Use this vector value for all users of the original instruction.
      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
      WidenMap[Inst] = V;
      // Update the NSW, NUW and Exact flags.
      BinaryOperator *VecOp = cast<BinaryOperator>(V);
      if (isa<OverflowingBinaryOperator>(BinOp)) {
        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
      }
      if (isa<PossiblyExactOperator>(VecOp))
        VecOp->setIsExact(BinOp->isExact());
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      Value *Cond = Inst->getOperand(0);
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      Cond = getVectorValue(Cond);
      if (InvariantCond)
        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));

      Value *Op0 = getVectorValue(Inst->getOperand(1));
      Value *Op1 = getVectorValue(Inst->getOperand(2));
      WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Value *B = getVectorValue(Inst->getOperand(1));
      if (FCmp)
        WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      else
        WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      break;
    }

    case Instruction::Store: {
      // Attempt to issue a wide store.
      StoreInst *SI = dyn_cast<StoreInst>(Inst);
      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
      Value *Ptr = SI->getPointerOperand();
      unsigned Alignment = SI->getAlignment();

      assert(!Legal->isUniform(Ptr) &&
             "We do not allow storing to uniform addresses");

      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

      // If the pointer is not consecutive, scalarize the store.
      if (!Legal->isConsecutivePtr(Ptr)) {
        scalarizeInstruction(Inst);
        break;
      }

      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
      }
      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
      Value *Val = getVectorValue(SI->getValueOperand());
      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
      break;
    }
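    // Illustrative sketch (not literal output of this code): for a
    // consecutive store such as A[i] = x and VF == 4, the scalar
    //   store i32 %x, i32* %gep
    // becomes, roughly,
    //   %vptr = bitcast i32* %gep to <4 x i32>*
    //   store <4 x i32> %x.vec, <4 x i32>* %vptr
    // while a non-consecutive store is scalarized into VF separate stores.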
    case Instruction::Load: {
      // Attempt to issue a wide load.
      LoadInst *LI = dyn_cast<LoadInst>(Inst);
      Type *RetTy = VectorType::get(LI->getType(), VF);
      Value *Ptr = LI->getPointerOperand();
      unsigned Alignment = LI->getAlignment();
      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

      // If the pointer is loop invariant or if it is non-consecutive,
      // scalarize the load.
      bool Con = Legal->isConsecutivePtr(Ptr);
      if (Legal->isUniform(Ptr) || !Con) {
        scalarizeInstruction(Inst);
        break;
      }

      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
      }

      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
      LI = Builder.CreateLoad(Ptr);
      LI->setAlignment(Alignment);
      // Use this vector value for all users of the load.
      WidenMap[Inst] = LI;
      break;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      /// Vectorize casts.
      CastInst *CI = dyn_cast<CastInst>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
      WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      break;
    }

    default:
      /// All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(Inst);
      break;
    }// end of switch.
  }// end of for_each instr.

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the reduction variables.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];
    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. Zero for addition, OR and XOR,
    // one for multiplication, all-ones for AND.
    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements.
    for (unsigned i=1; i < VF; ++i) {
      Value *Scalar1 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
      switch (RdxDesc.Kind) {
      case LoopVectorizationLegality::IntegerAdd:
        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerMult:
        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerOr:
        Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerAnd:
        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerXor:
        Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
        break;
      default:
        llvm_unreachable("Unknown reduction operation");
      }
    }
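    // As an illustration (VF == 4, IntegerAdd; not literal output of this
    // code), the middle block ends up with roughly
    //   %s0 = extractelement <4 x i32> %rdx, i32 0
    //   %s1 = extractelement <4 x i32> %rdx, i32 1
    //   %a0 = add i32 %s0, %s1
    //   %s2 = extractelement <4 x i32> %rdx, i32 2
    //   %a1 = add i32 %a0, %s2
    //   %s3 = extractelement <4 x i32> %rdx, i32 3
    //   %a2 = add i32 %a1, %s3
    // and %a2 is the scalar reduction value that is used below.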
    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.
}

void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}


bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion)
    return false;

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();

  // Collect the blocks that need predication.
  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
    BasicBlock *BB = LoopBlocks[i];

    // Each block must have at most two predecessors because we need to
    // convert all PHIs to selects.
    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
    if (Preds > 2)
      return false;

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
      return false;
  }

  // We can if-convert this loop.
  return true;
}

bool LoopVectorizationLegality::canVectorize() {
  assert(TheLoop->getLoopPreheader() && "No preheader!!");

  // We can only vectorize innermost loops.
  if (TheLoop->getSubLoopsVector().size())
    return false;

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1)
    return false;

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock())
    return false;

  unsigned NumBlocks = TheLoop->getNumBlocks();

  // Check if we can if-convert non-single-bb loops.
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }
  BasicBlock *Header = TheLoop->getHeader();
  BasicBlock *Latch = TheLoop->getLoopLatch();
  DEBUG(dbgs() << "LV: Found a loop: " << Header->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
  if (TC > 0u && TC < TinyTripCountThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs(*Header)) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  // Collect all of the variables that remain uniform after vectorization.
  collectLoopUniforms();

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        << "!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) {
  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the instructions in the current block and look for hazards.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;

      if (PHINode *Phi = dyn_cast<PHINode>(I)) {
        // This should not happen because the loop should be normalized.
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to a select during if-conversion.
        if (*bb != Header) {
          continue;
        }

        // This is the value coming from the preheader.
        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);

        // We only look at integer and pointer phi nodes.
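        // For example (illustrative only), a vectorizable header may contain:
        //   %i   = phi i32 [ 0, %preheader ], [ %i.next, %latch ]
        //   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %latch ]
        // The first PHI is recognized by isInductionVariable() below and the
        // second by AddReductionVar(); any other integer PHI pattern rejects
        // the loop.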
        if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
          DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
          Inductions[Phi] = StartValue;
          continue;
        } else if (!Phi->getType()->isIntegerTy()) {
          DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
          return false;
        }

        // Handle integer PHIs:
        if (isInductionVariable(Phi)) {
          if (Induction) {
            DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
            return false;
          }
          DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
          Induction = Phi;
          Inductions[Phi] = StartValue;
          continue;
        }
        if (AddReductionVar(Phi, IntegerAdd)) {
          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, IntegerMult)) {
          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, IntegerOr)) {
          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, IntegerAnd)) {
          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, IntegerXor)) {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }

        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
        return false;
      }// end of PHI handling

      // We don't handle function calls yet.
      CallInst *CI = dyn_cast<CallInst>(I);
      if (CI) {
        DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }

      // We do not re-vectorize vectors.
      if (!VectorType::isValidElementType(I->getType()) &&
          !I->getType()->isVoidTy()) {
        DEBUG(dbgs() << "LV: Found an unvectorizable type.\n");
        return false;
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (!AllowedExit.count(I))
        // Check that all of the users of the instruction are inside the loop.
        for (Value::use_iterator it = I->use_begin(), e = I->use_end();
             it != e; ++it) {
          Instruction *U = cast<Instruction>(*it);
          // This user may be a reduction exit value.
          if (!TheLoop->contains(U)) {
            DEBUG(dbgs() << "LV: Found an outside user for: "<< *U << "\n");
            return false;
          }
        }
    } // next instr.

  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    assert(getInductionVars()->size() && "No induction variables");
  }

  return true;
}

void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(Latch->getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this loop.
    // Stop when reaching PHI nodes.
    // TODO: we need to prevent cycles, but we do need to follow PHIs inside
    // this loop.
    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
      continue;
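    // For example (illustrative only): if the latch ends with
    //   %cmp = icmp slt i32 %i.next, %n
    //   br i1 %cmp, label %body, label %exit
    // then %cmp and the non-PHI in-loop instructions feeding it are collected
    // here, and the cost model will price them as scalar (uniform) operations.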
    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld) return false;
        if (!Ld->isSimple()) {
          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
          return false;
        }
        Loads.push_back(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to
      // memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) return false;
        if (!St->isSimple()) {
          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
          return false;
        }
        Stores.push_back(St);
      }
    } // next instr.
  } // next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it into
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it into the
    // read list. If we *did* see it before, then it is already in
    // the read-write list.
    // This allows us to vectorize expressions
    // such as A[i] += x, because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or find an
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes and the reads.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
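  // For example (illustrative only): in 'A[i] = B[i] + 1' where A and B are
  // distinct local arrays or globals, both underlying objects are identified
  // and disjoint, so we reach this point and drop the runtime check. If A or
  // B were plain pointer arguments we would have returned RT above and relied
  // on the runtime overlap check instead (when it was computable).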
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.reset();
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes the users of the reduction variable's update chain, which
  // forms a cycle that ends in the phi node.
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of Iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      return true;
    }
  }
}

bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind) {
  switch (I->getOpcode()) {
  default:
    return false;
  case Instruction::PHI:
    // Possibly.
    return true;
  case Instruction::Add:
  case Instruction::Sub:
    return Kind == IntegerAdd;
  case Instruction::Mul:
    return Kind == IntegerMult;
  case Instruction::And:
    return Kind == IntegerAnd;
  case Instruction::Or:
    return Kind == IntegerOr;
  case Instruction::Xor:
    return Kind == IntegerXor;
  }
}

bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return false;

  // Check that the PHI is a consecutive (affine) recurrence.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy())
    return Step->isOne();

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) return false;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  return (C->getValue()->equalsInt(Size));
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock* Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We don't predicate loads/stores at the moment.
    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
      return false;

    // The instructions below can trap.
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing.\n");
    return 1;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  for (unsigned i = 2; i <= VF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
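    // For example (illustrative numbers only): if the scalar body costs 8 and
    // the VF=4 body costs 20, the per-iteration vector cost is 20/4 = 5, which
    // beats 8, so Width becomes 4 unless a wider factor is cheaper still.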
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF = "<< Width << ".\n");
  return Width;
}

unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // We can only estimate the cost of single basic block loops.
  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");

  BasicBlock *BB = TheLoop->getHeader();
  unsigned Cost = 0;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
    unsigned C = getInstructionCost(Inst, VF);
    Cost += C;
    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
          " For instruction: "<< *Inst << "\n");
  }

  return Cost;
}

unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because scalar GEPs are usually
    // lowered into the instruction's addressing mode. At the moment we don't
    // generate vector geps.
    return 0;
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    VectorTy = ToVectorTy(ValTy, VF);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                   SI->getAlignment(),
                                   SI->getPointerAddressSpace());

    // Scalarized stores.
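    // For example (illustrative only): with VF = 4, a store through a
    // non-consecutive pointer is priced as 4 extractelement operations plus
    // 4 scalar stores, i.e. Cost = 4 * ExtCost + 4 * scalar-store cost, as
    // computed below.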
    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            ValTy);
      // The cost of extracting from the value vector.
      Cost += VF * (ExtCost);
      // The cost of the scalar stores.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         ValTy->getScalarType(),
                                         SI->getAlignment(),
                                         SI->getPointerAddressSpace());
      return Cost;
    }

    // Wide stores.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                   LI->getAlignment(),
                                   LI->getPointerAddressSpace());

    // Scalarized loads.
    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
      // The cost of inserting the loaded value into the result vector.
      Cost += VF * (InCost);
      // The cost of the scalar loads.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         RetTy->getScalarType(),
                                         LI->getAlignment(),
                                         LI->getPointerAddressSpace());
      return Cost;
    }

    // Wide loads.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    bool IsVoid = RetTy->isVoidTy();

    unsigned InsCost = (IsVoid ? 0 :
                        VTTI->getInstrCost(Instruction::InsertElement,
                                           VectorTy));

    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                          VectorTy);

    // The cost of inserting the results plus extracting each one of the
    // operands.
    Cost += VF * (InsCost + ExtCost * I->getNumOperands());

    // The cost of executing VF copies of the scalar instruction.
    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
    return Cost;
  }
  }// end of switch.
}

Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

} // namespace

char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}