//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
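//
// For example (an illustrative sketch, not the exact IR this pass emits),
// with a vectorization factor of four a loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + K;
//
// is conceptually rewritten so that each new iteration processes four
// consecutive elements with wide loads, adds and stores, while a scalar
// epilogue loop handles the remaining 'n % 4' iterations.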
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
          cl::desc("Set the default vectorization width. Zero is autoselect."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;

/// This is the highest vector width that we try to generate.
const unsigned MaxVectorSize = 8;

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// SingleBlockLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The SingleBlockLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found, for a given vectorization factor.
class SingleBlockLoopVectorizer {
public:
  /// Ctor.
  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                            DominatorTree *Dt, DataLayout *Dl,
                            LPPassManager *Lpm, unsigned VecWidth)
    : OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth),
      Builder(Se->getContext()), Induction(0), OldInduction(0) { }

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// Add code that checks at runtime if the accessed arrays overlap.
  /// Returns the comparator value or NULL if no check is needed.
  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
                         Instruction *Loc);
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  void scalarizeInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
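  /// For example (an illustrative sketch), with VF == 4 broadcasting the
  /// induction value 'i' produces <i, i, i, i>, and getConsecutiveVector
  /// then adds <0, 1, 2, 3> to give the per-lane values <i, i+1, i+2, i+3>.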
  Value *getBroadcastInstrs(Value *V);

  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
  /// for each element in the vector, starting from zero.
  Value *getConsecutiveVector(Value* Val);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

  /// Get a uniform vector of constant integers. We use this to get
  /// vectors of ones and zeros for the reduction code.
  Constant* getUniformVector(unsigned Val, Type* ScalarTy);

  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
  Loop *OrigLoop;
  // Scev analysis to use.
  ScalarEvolution *SE;
  // Loop Info.
  LoopInfo *LI;
  // Dominator Tree.
  DominatorTree *DT;
  // Data Layout.
  DataLayout *DL;
  // Loop Pass Manager.
  LPPassManager *LPM;
  // The vectorization factor to use.
  unsigned VF;

  // The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// The first bypass block.
  BasicBlock *LoopBypassBlock;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  // Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeBlock checks for a number
///   of different conditions, such as the availability of a single induction
///   variable, and that all types are supported and vectorizable.
/// This code reflects the capabilities of SingleBlockLoopVectorizer.
/// This class is also used by SingleBlockLoopVectorizer for identifying
/// the induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl)
    : TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }

  /// This represents the kinds of reductions that we support.
  enum ReductionKind {
    NoReduction, /// Not a reduction.
    IntegerAdd,  /// Sum of numbers.
    IntegerMult, /// Product of numbers.
    IntegerOr,   /// Bitwise or logical OR of numbers.
    IntegerAnd,  /// Bitwise or logical AND of numbers.
    IntegerXor   /// Bitwise or logical XOR of numbers.
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    // Default C'tor
    ReductionDescriptor()
      : StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}

    // C'tor.
    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
      : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
  };
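
  // For example (illustrative), in a loop such as
  //   for (i = 0; i < n; ++i)
  //     Sum += A[i];
  // 'Sum' is an IntegerAdd reduction: StartValue is the value of 'Sum'
  // entering the loop, and LoopExitInstr is the 'add' whose result is used
  // outside the loop.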

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
      const SCEV *Sc = SE->getSCEV(Ptr);
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
      assert(AR && "Invalid addrec expression");
      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
      Pointers.push_back(Ptr);
      Starts.push_back(AR->getStart());
      Ends.push_back(ScEnd);
    }

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the initial
  /// value entering the loop.
  typedef DenseMap<PHINode*, Value*> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  bool isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeBlock(BasicBlock &BB);

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if BB is vectorizable.
  bool canVectorizeMemory(BasicBlock &BB);

  /// Returns true if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns true if the instruction I can be a reduction variable of type
  /// 'Kind'.
  bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns true if 'Phi' is an induction variable.
  bool isInductionVariable(PHINode *Phi);
  /// Return true if we can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
  /// C'tor.
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti)
    : TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF.
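  /// For example (illustrative numbers, assuming the per-lane cost
  /// expectedCost(VF)/VF is what gets compared): if expectedCost(1) == 4,
  /// expectedCost(2) == 6 and expectedCost(4) == 8, the per-lane costs are
  /// 4, 3 and 2, and a factor of four would be selected.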
  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const VectorTargetTransformInfo *VTTI;
};

struct LoopVectorize : public LoopPass {
  static char ID; // Pass identification, replacement for typeid

  LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Select the preferred vectorization factor.
    unsigned VF = 1;
    if (VectorizationFactor == 0) {
      const VectorTargetTransformInfo *VTTI = 0;
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        return false;
      }

    } else {
      // Use the user command flag.
      VF = VectorizationFactor;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF << ") in " <<
          L->getHeader()->getParent()->getParent()->getModuleIdentifier() <<
          "\n");

    // If we decided that it is *legal* to vectorize the loop, then do it.
    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<DominatorTree>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Create the types.
  LLVMContext &C = V->getContext();
  Type *VTy = VectorType::get(V->getType(), VF);
  Type *I32 = IntegerType::getInt32Ty(C);
  Constant *Zero = ConstantInt::get(I32, 0);
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                            "broadcast");
  // We are accessing the induction variable. Make sure to promote the
  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
  if (V == Induction)
    return getConsecutiveVector(Shuf);
  return Shuf;
}

Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  unsigned VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (unsigned i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");

  // If this pointer is an induction variable, return it.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && getInductionVars()->count(Phi))
    return true;

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return false;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return false;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return true;
  }

  return false;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  // If we saved a vectorized copy of V, use it.
  Value *&MapEntry = WidenMap[V];
  if (MapEntry)
    return MapEntry;

  // Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  MapEntry = B;
  return B;
}

Constant*
SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
}
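
// For example (a rough sketch rather than the exact output), scalarizing a
// non-consecutive load '%x = load i32* %p' with VF == 2 in the function below
// emits per-lane code of the form:
//   %p0 = extractelement <2 x i32*> %vp, i32 0
//   %x0 = load i32* %p0
//   %v0 = insertelement <2 x i32> undef, i32 %x0, i32 0
// followed by the same sequence for lane 1, which fills in the result vector.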

void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<Value*, 8> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(Induction));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap[SrcInst]);
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      Params.push_back(SrcOp);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
  Value *VecResults = 0;

  // If we have a return value, create an empty vector. We place the scalarized
  // instructions in this vector.
  if (!IsVoidRetTy)
    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));

  // For each scalar that we create:
  for (unsigned i = 0; i < VF; ++i) {
    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op];
      // Param is a vector. Need to extract the right lane.
      if (Op->getType()->isVectorTy())
        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
                                               Builder.getInt32(i));
  }

  if (!IsVoidRetTy)
    WidenMap[Instr] = VecResults;
}

Value*
SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                           Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Value *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value*, 2> Starts;
  SmallVector<Value*, 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr << "\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << "\n");

      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
                                       PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

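  // For each pair of pointers, emit a check of the form (illustrative):
  //   %bound0 = icmp ule %A.start, %B.end
  //   %bound1 = icmp ule %B.start, %A.end
  //   %found.conflict = and %bound0, %bound1
  // The two ranges overlap only if each range starts before the other ends;
  // the individual conflict bits are OR-ed into MemoryRuntimeCheck.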
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[i], Ends[j], "bound0", Loc);
      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Starts[j], Ends[i], "bound1", Loc);
      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                 "found.conflict", Loc);
      if (MemoryRuntimeCheck)
        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
                                                    MemoryRuntimeCheck,
                                                    IsConflict,
                                                    "conflict.rdx", Loc);
      else
        MemoryRuntimeCheck = IsConflict;
    }
  }

  return MemoryRuntimeCheck;
}

void
SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]     <-- vector loop bypass.
     /  |
    /   v
   |   [ ]     <-- vector pre header.
   |    |
   |    v
   |   [ ]  \
   |   [ ]_|   <-- vector loop.
   |    |
    \   v
      >[ ]     <--- middle-block.
     /  |
    /   v
   |   [ ]     <--- new preheader.
   |    |
   |    v
   |   [ ]  \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = OldInduction ? OldInduction->getType() :
    DL->getIntPtrType(SE->getContext());

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Value *StartIdx = OldInduction ?
    OldInduction->getIncomingValueForBlock(BypassBlock) :
    ConstantInt::get(IdxTy, 0);

  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
  assert(BypassBlock && "Invalid loop structure");

  // Generate the code that checks at runtime if the arrays overlap.
  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
                                              BypassBlock->getTerminator());

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // This is the location in which we add all of the logic for bypassing
  // the new vector loop.
  Instruction *Loc = BypassBlock->getTerminator();

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  Constant *Step = ConstantInt::get(IdxTy, VF);

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
    else
      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Constant *CIVF = ConstantInt::get(IdxTy, VF);
  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                     "end.idx.rnd.down", Loc);
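
  // For example (illustrative): with N == 10 and VF == 4, CountRoundDown is
  // 8, so the vector body executes two wide iterations and the scalar
  // remainder loop handles the last two elements.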

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                               IdxEndRoundDown,
                               StartIdx,
                               "cmp.zero", Loc);

  // If we are using memory runtime checks, include them in.
  if (MemoryRuntimeCheck)
    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
                                 "CntOrMem", Loc);

  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
  // Remove the old terminator.
  Loc->eraseFromParent();

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
    Value *EndValue = 0;
    if (OrigPhi->getType()->isIntegerTy()) {
      // Handle the integer induction counter:
      assert(OrigPhi == OldInduction && "Unknown integer PHI");
      // We know what the end value is.
      EndValue = IdxEndRoundDown;
      // We also know which PHI node holds it.
      ResumeIndex = ResumeVal;
    } else {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
                                           "ptr.ind.end",
                                           BypassBlock->getTerminator());
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    ResumeVal->addIncoming(I->second, BypassBlock);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction) {
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    ResumeIndex->addIncoming(StartIdx, BypassBlock);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where the scalar loop needs to continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());

  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Register the new loop.
  Loop* Lp = new Loop();
  LPM->insertLoop(Lp, OrigLoop->getParentLoop());

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  Loop *ParentLoop = OrigLoop->getParentLoop();
  if (ParentLoop) {
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  }

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
  LoopBypassBlock = BypassBlock;
}

/// This function returns the identity element (or neutral element) for
/// the operation K.
static unsigned
getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
  switch (K) {
  case LoopVectorizationLegality::IntegerXor:
  case LoopVectorizationLegality::IntegerAdd:
  case LoopVectorizationLegality::IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return 0;
  case LoopVectorizationLegality::IntegerMult:
    // Multiplying a number by 1 does not change it.
    return 1;
  case LoopVectorizationLegality::IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return -1;
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  typedef SmallVector<PHINode*, 4> PhiVector;
  BasicBlock &BB = *OrigLoop->getHeader();
  Constant *Zero = ConstantInt::get(
    IntegerType::getInt32Ty(BB.getContext()), 0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI: {
      PHINode* P = cast<PHINode>(Inst);
      // Handle reduction variables:
      if (Legal->getReductionVars()->count(P)) {
        // This is phase one of vectorizing PHIs.
        Type *VecTy = VectorType::get(Inst->getType(), VF);
        WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
                                         LoopVectorBody->getFirstInsertionPt());
        RdxPHIsToFix.push_back(P);
        continue;
      }

      // This PHINode must be an induction variable.
      // Make sure that we know about it.
      assert(Legal->getInductionVars()->count(P) &&
             "Not an induction variable");

      if (P->getType()->isIntegerTy()) {
        assert(P == OldInduction && "Unexpected PHI");
        WidenMap[Inst] = getBroadcastInstrs(Induction);
        continue;
      }

      // Handle pointer inductions.
      assert(P->getType()->isPointerTy() && "Unexpected type.");
      Value *StartIdx = OldInduction ?
        Legal->getInductionVars()->lookup(OldInduction) :
        ConstantInt::get(Induction->getType(), 0);

      // This is the pointer value coming into the loop.
      Value *StartPtr = Legal->getInductionVars()->lookup(P);

      // This is the normalized GEP that starts counting at zero.
      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                               "normalized.idx");

      // This is the vector of results. Notice that we don't generate vector
      // geps because scalar geps result in better code.
      Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
      for (unsigned int i = 0; i < VF; ++i) {
        Constant *Idx = ConstantInt::get(Induction->getType(), i);
        Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
        Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
        VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                             Builder.getInt32(i),
                                             "insert.gep");
      }

      WidenMap[Inst] = VecVal;
      continue;
    }
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Value *B = getVectorValue(Inst->getOperand(1));

      // Use this vector value for all users of the original instruction.
      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
      WidenMap[Inst] = V;

      // Update the NSW, NUW and Exact flags.
      BinaryOperator *VecOp = cast<BinaryOperator>(V);
      if (isa<OverflowingBinaryOperator>(BinOp)) {
        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
      }
      if (isa<PossiblyExactOperator>(VecOp))
        VecOp->setIsExact(BinOp->isExact());
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      Value *Cond = Inst->getOperand(0);
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      Cond = getVectorValue(Cond);
      if (InvariantCond)
        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));

      Value *Op0 = getVectorValue(Inst->getOperand(1));
      Value *Op1 = getVectorValue(Inst->getOperand(2));
      WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Value *B = getVectorValue(Inst->getOperand(1));
      if (FCmp)
        WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      else
        WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      break;
    }

    case Instruction::Store: {
      // Attempt to issue a wide store.
      StoreInst *SI = dyn_cast<StoreInst>(Inst);
      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
      Value *Ptr = SI->getPointerOperand();
      unsigned Alignment = SI->getAlignment();

      assert(!Legal->isUniform(Ptr) &&
             "We do not allow storing to uniform addresses");

      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

      // If the pointer is not consecutive, scalarize the store.
      if (!Legal->isConsecutivePtr(Ptr)) {
        scalarizeInstruction(Inst);
        break;
      }

      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
      }
      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
      Value *Val = getVectorValue(SI->getValueOperand());
      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
      break;
    }
    case Instruction::Load: {
      // Attempt to issue a wide load.
      LoadInst *LI = dyn_cast<LoadInst>(Inst);
      Type *RetTy = VectorType::get(LI->getType(), VF);
      Value *Ptr = LI->getPointerOperand();
      unsigned Alignment = LI->getAlignment();
      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

      // If the pointer is loop invariant or if it is non consecutive,
      // scalarize the load.
      bool Con = Legal->isConsecutivePtr(Ptr);
      if (Legal->isUniform(Ptr) || !Con) {
        scalarizeInstruction(Inst);
        break;
      }

      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
      }

      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
      LI = Builder.CreateLoad(Ptr);
      LI->setAlignment(Alignment);
      // Use this vector value for all users of the load.
      WidenMap[Inst] = LI;
      break;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      /// Vectorize casts.
      CastInst *CI = dyn_cast<CastInst>(Inst);
      Value *A = getVectorValue(Inst->getOperand(0));
      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
      WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      break;
    }

    default:
      /// All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(Inst);
      break;
    }// end of switch.
  }// end of for_each instr.

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements.
    for (unsigned i = 1; i < VF; ++i) {
      Value *Scalar1 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
      switch (RdxDesc.Kind) {
      case LoopVectorizationLegality::IntegerAdd:
        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerMult:
        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerOr:
        Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerAnd:
        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerXor:
        Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
        break;
      default:
        llvm_unreachable("Unknown reduction operation");
      }
    }
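
    // For example (illustrative), for an IntegerAdd reduction with VF == 4
    // the loop above emits three scalar adds that sum the four vector lanes
    // into Scalar0.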

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.
}

void SingleBlockLoopVectorizer::updateAnalysis() {
  // Forget the original loop.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}

bool LoopVectorizationLegality::canVectorize() {
  if (!TheLoop->getLoopPreheader()) {
    assert(false && "No preheader!!");
    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
    return false;
  }

  // We can only vectorize single basic block loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1) {
    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *BB = TheLoop->getHeader();
  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
  if (TC > 0u && TC < TinyTripCountThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeBlock(*BB)) {
    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "") << "!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {

  BasicBlock *PreHeader = TheLoop->getLoopPreheader();

  // Scan the instructions in the block and look for hazards.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }

      // This is the value coming from the preheader.
      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);

      // We only look at integer and pointer phi nodes.
      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
        Inductions[Phi] = StartValue;
        continue;
      } else if (!Phi->getType()->isIntegerTy()) {
        DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
        return false;
      }

      // Handle integer PHIs:
      if (isInductionVariable(Phi)) {
        if (Induction) {
          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
          return false;
        }
        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
        Induction = Phi;
        Inductions[Phi] = StartValue;
        continue;
      }
      if (AddReductionVar(Phi, IntegerAdd)) {
        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerMult)) {
        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerOr)) {
        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerAnd)) {
        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerXor)) {
        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
        continue;
      }

      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
      return false;
    }// end of PHI handling

    // We don't handle function calls yet.
    CallInst *CI = dyn_cast<CallInst>(I);
    if (CI) {
      DEBUG(dbgs() << "LV: Found a call site.\n");
      return false;
    }

    // We do not re-vectorize vectors.
    if (!VectorType::isValidElementType(I->getType()) &&
        !I->getType()->isVoidTy()) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }

    // Reduction instructions are allowed to have exit users.
    // All other instructions must not have external users.
    if (!AllowedExit.count(I))
      // Check that all of the users of this instruction are inside the BB.
      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
           it != e; ++it) {
        Instruction *U = cast<Instruction>(*it);
        // This user may be a reduction exit value.
        BasicBlock *Parent = U->getParent();
        if (Parent != &BB) {
          DEBUG(dbgs() << "LV: Found an outside user for: "<< *U << "\n");
          return false;
        }
      }
  } // next instr.

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    assert(getInductionVars()->size() && "No induction variables");
  }

  // Don't vectorize if the memory dependencies do not allow vectorization.
  if (!canVectorizeMemory(BB))
    return false;

  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(BB.getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this block. Stop when reaching PHI nodes.
    if (!I || I->getParent() != &BB || isa<PHINode>(I))
      continue;

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      if (!Ld->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it into
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it into the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x, because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or find an
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes and the reads.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.reset();
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
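  // As a rough example (names are illustrative), for
  //   int Sum = Init; for (i = 0; i < n; ++i) Sum += A[i];
  // the reduction PHI has the loop-invariant 'Init' on its preheader edge
  // (RdxStart below) and the in-loop 'add' on its backedge; that 'add' is
  // the candidate loop-exit instruction found by the cycle walk further down.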
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction variables, which form a cycle
  // that ends in the phi node.
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of Iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      return true;
    }
  }
}

bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind) {
  switch (I->getOpcode()) {
  default:
    return false;
  case Instruction::PHI:
    // Possibly.
    return true;
  case Instruction::Add:
  case Instruction::Sub:
    return Kind == IntegerAdd;
  case Instruction::Mul:
    return Kind == IntegerMult;
  case Instruction::And:
    return Kind == IntegerAnd;
  case Instruction::Or:
    return Kind == IntegerOr;
  case Instruction::Xor:
    return Kind == IntegerXor;
  }
}

bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return false;

  // Check that the PHI is consecutive.
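  // Roughly, an integer induction 'for (i = 0; i < n; ++i)' is the add
  // recurrence {0,+,1}<loop>, and an i32* pointer bumped by one element per
  // iteration is {%base,+,4}<loop> (i32 has an alloc size of 4 bytes; %base
  // is a placeholder). The checks below accept only such unit-stride forms.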
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy())
    return Step->isOne();

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) return false;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  return (C->getValue()->equalsInt(Size));
}

bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing.\n");
    return 1;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the vectorization
    // factor.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
  return Width;
}

unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // We can only estimate the cost of single basic block loops.
  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");

  BasicBlock *BB = TheLoop->getHeader();
  unsigned Cost = 0;

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
    unsigned C = getInstructionCost(Inst, VF);
    Cost += C;
    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
          " for instruction: "<< *Inst << "\n");
  }

  return Cost;
}

unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");

  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (Legal->isUniformAfterVectorization(I))
    VF = 1;

  Type *RetTy = I->getType();
  Type *VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because scalar GEPs are usually
    // lowered into the instruction's addressing mode. At the moment we don't
    // generate vector GEPs.
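    // (For example, a GEP that feeds a consecutive load or store is expected
    // to be folded into the memory instruction's addressing mode by the
    // backend; charging it here would double-count. This is a heuristic, not
    // a guarantee.)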
    return 0;
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::PHI:
    return 0;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    VectorTy = ToVectorTy(ValTy, VF);
    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    VectorTy = ToVectorTy(ValTy, VF);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                   SI->getAlignment(),
                                   SI->getPointerAddressSpace());

    // Scalarized stores.
    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                            ValTy);
      // The cost of extracting from the value vector.
      Cost += VF * (ExtCost);
      // The cost of the scalar stores.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         ValTy->getScalarType(),
                                         SI->getAlignment(),
                                         SI->getPointerAddressSpace());
      return Cost;
    }

    // Wide stores.
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);

    if (VF == 1)
      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
                                   LI->getAlignment(),
                                   LI->getPointerAddressSpace());

    // Scalarized loads.
    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
      unsigned Cost = 0;
      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
      // The cost of inserting the loaded value into the result vector.
      Cost += VF * (InCost);
      // The cost of the scalar loads.
      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                         RetTy->getScalarType(),
                                         LI->getAlignment(),
                                         LI->getPointerAddressSpace());
      return Cost;
    }

    // Wide loads.
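    // A consecutive load is assumed to become a single vector load of
    // 'VF x RetTy', so we charge one memory operation for the whole group
    // and let the target hook account for alignment.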
    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  }
  default: {
    // We are scalarizing the instruction. Return the cost of the scalar
    // instruction, plus the cost of insert and extract into vector
    // elements, times the vector width.
    unsigned Cost = 0;

    bool IsVoid = RetTy->isVoidTy();

    unsigned InsCost = (IsVoid ? 0 :
                        VTTI->getInstrCost(Instruction::InsertElement,
                                           VectorTy));

    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                          VectorTy);

    // The cost of inserting the results plus extracting each one of the
    // operands.
    Cost += VF * (InsCost + ExtCost * I->getNumOperands());

    // The cost of executing VF copies of the scalar instruction.
    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
    return Cost;
  }
  }// end of switch.
}

Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

} // namespace

char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
}
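
// Usage sketch (assumes the standard legacy PassManager setup; not part of
// the pass itself):
//   PassManager PM;
//   PM.add(createLoopVectorizePass());
//   PM.run(M);
// The pass is also registered as -loop-vectorize (LV_NAME) for use with
// 'opt'.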