//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. SingleBlockLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Value.h"
#include "llvm/Function.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

// Command-line override for the vectorization factor. Zero (the default)
// lets the cost model pick the width; any other value forces it.
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                    cl::desc("Set the default vectorization width. Zero is autoselect."));

/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;

/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
const unsigned RuntimeMemoryCheckThreshold = 2;

namespace {

// Forward declarations.
class LoopVectorizationLegality;
class LoopVectorizationCostModel;

/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// SingleBlockLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The SingleBlockLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found, for a given vectorization factor.
class SingleBlockLoopVectorizer {
public:
  /// Ctor.
  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                            DominatorTree *dt, LPPassManager *Lpm,
                            unsigned VecWidth):
  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
  Builder(Se->getContext()), Induction(0), OldInduction(0) { }

  // Perform the actual loop widening (vectorization).
  void vectorize(LoopVectorizationLegality *Legal) {
    // Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop(Legal);
    // Widen each instruction in the old loop to a new one in the new loop.
    // Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
    // Register the new loop and update the analysis passes.
    updateAnalysis();
  }

private:
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop(LoopVectorizationLegality *Legal);
  /// Copy and widen the instructions from the old loop.
  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// This instruction is un-vectorizable. Implement it as a sequence
  /// of scalars.
  void scalarizeInstruction(Instruction *Instr);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  Value *getBroadcastInstrs(Value *V);

  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
  /// for each element in the vector. Starting from zero.
  Value *getConsecutiveVector(Value* Val);

  /// When we go over instructions in the basic block we rely on previous
  /// values within the current basic block or on loop invariant values.
  /// When we widen (vectorize) values we place them in the map. If the values
  /// are not within the map, they have to be loop invariant, so we simply
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

  /// Get a uniform vector of constant integers. We use this to get
  /// vectors of ones and zeros for the reduction code.
  Constant* getUniformVector(unsigned Val, Type* ScalarTy);

  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
  Loop *OrigLoop;
  // Scev analysis to use.
  ScalarEvolution *SE;
  // Loop Info.
  LoopInfo *LI;
  // Dominator Tree.
  DominatorTree *DT;
  // Loop Pass Manager;
  LPPassManager *LPM;
  // The vectorization factor to use.
  unsigned VF;

  // The builder that we use to construct the vectorized instructions.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// The first bypass block.
  BasicBlock *LoopBypassBlock;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;
  // Maps scalars to widened vectors.
  ValueMap WidenMap;
};

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeBlock checks for a number
///   of different conditions, such as the availability of a single induction
///   variable, that all types are supported and vectorize-able, etc.
/// This code reflects the capabilities of SingleBlockLoopVectorizer.
/// This class is also used by SingleBlockLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }

  /// This represents the kinds of reductions that we support.
  enum ReductionKind {
    NoReduction, /// Not a reduction.
    IntegerAdd,  /// Sum of numbers.
    IntegerMult, /// Product of numbers.
    IntegerOr,   /// Bitwise or logical OR of numbers.
    IntegerAnd,  /// Bitwise or logical AND of numbers.
    IntegerXor   /// Bitwise or logical XOR of numbers.
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    // Default C'tor
    ReductionDescriptor():
    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}

    // C'tor.
    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() {return Induction;}

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Check if the pointer returned by this GEP is consecutive
  /// when the index is vectorized. This happens when the last
  /// index of the GEP is consecutive, like the induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  bool isConsecutiveGep(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeBlock(BasicBlock &BB);

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if BB is vectorizable
  bool canVectorizeMemory(BasicBlock &BB);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns true if the instruction I can be a reduction variable of type
  /// 'Kind'.
  bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns True, if 'Phi' is an induction variable.
  bool isInductionVariable(PHINode *Phi);
  /// Return true if can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;

  // --- vectorization state --- //

  /// Holds the induction variable.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because
/// of a number of reasons. In this class we mainly attempt to predict
/// the expected speedup/slowdowns due to the supported instruction set.
/// We use the VectorTargetTransformInfo to query the different backends
/// for the cost of different operations.
class LoopVectorizationCostModel {
public:
  /// C'tor.
  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
  /// of two up to VF.
  unsigned findBestVectorizationFactor(unsigned VF = 8);

private:
  /// Returns the expected execution cost.
  /// The unit of the cost does not matter because we use the 'cost' units
  /// to compare different vector widths. The cost that is returned is *not*
  /// normalized by the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const VectorTargetTransformInfo *VTTI;
};

/// The LoopVectorize pass: drives legality checking, the cost model and the
/// actual widening for each innermost loop.
struct LoopVectorize : public LoopPass {
  static char ID; // Pass identification, replacement for typeid

  LoopVectorize() : LoopPass(ID) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Select the preferred vectorization factor.
    unsigned VF = 1;
    if (VectorizationFactor == 0) {
      const VectorTargetTransformInfo *VTTI = 0;
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
        return false;
      }

    } else {
      // Use the user command flag.
      VF = VectorizationFactor;
    }

    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
          "\n");

    // If we decided that it is *legal* to vectorize the loop then do it.
    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    // We need loops in simplified/LCSSA form with a single preheader.
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<DominatorTree>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Instructions that access the old induction variable
  // actually want to get the new one.
  if (V == OldInduction)
    V = Induction;
  // Create the types.
  LLVMContext &C = V->getContext();
  Type *VTy = VectorType::get(V->getType(), VF);
  Type *I32 = IntegerType::getInt32Ty(C);
  Constant *Zero = ConstantInt::get(I32, 0);
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                            "broadcast");
  // We are accessing the induction variable. Make sure to promote the
  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
  if (V == Induction)
    return getConsecutiveVector(Shuf);
  return Shuf;
}

Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  unsigned VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (unsigned i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  return Builder.CreateAdd(Val, Cv, "induction");
}

bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return false;

  unsigned NumOperands = Gep->getNumOperands();
  Value *LastIndex = Gep->getOperand(NumOperands - 1);

  // Check that all of the gep indices are uniform except for the last.
  for (unsigned i = 0; i < NumOperands - 1; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return false;

  // We can emit wide load/stores only if the last index is the induction
  // variable.
  const SCEV *Last = SE->getSCEV(LastIndex);
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // The memory is consecutive because the last index is consecutive
    // and all other indices are loop invariant.
    if (Step->isOne())
      return true;
  }

  return false;
}

bool LoopVectorizationLegality::isUniform(Value *V) {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  // If we saved a vectorized copy of V, use it.
  Value *&MapEntry = WidenMap[V];
  if (MapEntry)
    return MapEntry;

  // Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  MapEntry = B;
  return B;
}

Constant*
SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
  SmallVector<Constant*, 8> Indices;
  // Create VF copies of the same scalar constant (a splat).
  for (unsigned i = 0; i < VF; ++i)
    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));

  // Build the uniform (splat) vector.
  return ConstantVector::get(Indices);
}

void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<Value*, 8> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getBroadcastInstrs(Induction));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap[SrcInst]);
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      Params.push_back(SrcOp);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
  Value *VecResults = 0;

  // If we have a return value, create an empty vector. We place the scalarized
  // instructions in this vector.
  if (!IsVoidRetTy)
    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));

  // For each scalar that we create:
  for (unsigned i = 0; i < VF; ++i) {
    Instruction *Cloned = Instr->clone();
    if (!IsVoidRetTy)
      Cloned->setName(Instr->getName() + ".cloned");
    // Replace the operands of the cloned instructions with extracted scalars.
    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
      Value *Op = Params[op];
      // Param is a vector. Need to extract the right lane.
      if (Op->getType()->isVectorTy())
        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
    Builder.Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
                                               Builder.getInt32(i));
  }

  if (!IsVoidRetTy)
    WidenMap[Instr] = VecResults;
}

void
SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 /*
 In this function we generate a new loop. The new loop will contain
 the vectorized instructions while the old loop will continue to run the
 scalar remainder.

    [ ]     <-- vector loop bypass.
   /  |
  /   v
 |   [ ]    <-- vector pre header.
 |    |
 |    v
 |   [ ] \
 |   [ ]_|  <-- vector loop.
 |    |
  \   v
   >[ ]     <--- middle-block.
   /  |
  /   v
 |   [ ]    <--- new preheader.
 |    |
 |    v
 |   [ ] \
 |   [ ]_|  <-- old scalar loop to handle remainder.
  \   |
   \  v
    >[ ]    <-- exit block.
  ...
 */

  OldInduction = Legal->getInduction();
  assert(OldInduction && "We must have a single phi node.");
  Type *IdxTy = OldInduction->getType();

  // Find the loop boundaries.
667 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 668 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 669 670 // Get the total trip count from the count by adding 1. 671 ExitCount = SE->getAddExpr(ExitCount, 672 SE->getConstant(ExitCount->getType(), 1)); 673 // We may need to extend the index in case there is a type mismatch. 674 // We know that the count starts at zero and does not overflow. 675 // We are using Zext because it should be less expensive. 676 if (ExitCount->getType() != IdxTy) 677 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 678 679 // This is the original scalar-loop preheader. 680 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 681 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 682 assert(ExitBlock && "Must have an exit block"); 683 684 // The loop index does not have to start at Zero. It starts with this value. 685 Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); 686 687 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 688 assert(BypassBlock && "Invalid loop structure"); 689 690 BasicBlock *VectorPH = 691 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 692 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 693 "vector.body"); 694 695 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 696 "middle.block"); 697 BasicBlock *ScalarPH = 698 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 699 "scalar.preheader"); 700 // Find the induction variable. 701 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 702 703 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 704 // inside the loop. 705 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 706 707 // Generate the induction variable. 
708 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 709 Constant *Step = ConstantInt::get(IdxTy, VF); 710 711 // Expand the trip count and place the new instructions in the preheader. 712 // Notice that the pre-header does not change, only the loop body. 713 SCEVExpander Exp(*SE, "induction"); 714 Instruction *Loc = BypassBlock->getTerminator(); 715 716 // Count holds the overall loop count (N). 717 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 718 719 // Add the start index to the loop count to get the new end index. 720 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 721 722 // Now we need to generate the expression for N - (N % VF), which is 723 // the part that the vectorized body will execute. 724 Constant *CIVF = ConstantInt::get(IdxTy, VF); 725 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 726 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 727 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 728 "end.idx.rnd.down", Loc); 729 730 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 731 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 732 IdxEndRoundDown, 733 StartIdx, 734 "cmp.zero", Loc); 735 736 LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 737 Legal->getRuntimePointerCheck(); 738 Value *MemoryRuntimeCheck = 0; 739 if (PtrRtCheck->Need) { 740 unsigned NumPointers = PtrRtCheck->Pointers.size(); 741 SmallVector<Value* , 2> Starts; 742 SmallVector<Value* , 2> Ends; 743 744 // Use this type for pointer arithmetic. 
745 Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); 746 747 for (unsigned i=0; i < NumPointers; ++i) { 748 Value *Ptr = PtrRtCheck->Pointers[i]; 749 const SCEV *Sc = SE->getSCEV(Ptr); 750 751 if (SE->isLoopInvariant(Sc, OrigLoop)) { 752 DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" << 753 *Ptr <<"\n"); 754 Starts.push_back(Ptr); 755 Ends.push_back(Ptr); 756 } else { 757 DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); 758 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 759 Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc); 760 const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 761 const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 762 assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range."); 763 Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc); 764 Starts.push_back(Start); 765 Ends.push_back(End); 766 } 767 } 768 769 for (unsigned i=0; i < NumPointers; ++i) { 770 for (unsigned j=i+1; j < NumPointers; ++j) { 771 Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 772 Starts[0], Ends[1], "bound0", Loc); 773 Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, 774 Starts[1], Ends[0], "bound1", Loc); 775 Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, 776 "found.conflict", Loc); 777 if (MemoryRuntimeCheck) { 778 MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, 779 MemoryRuntimeCheck, 780 IsConflict, 781 "conflict.rdx", Loc); 782 } else { 783 MemoryRuntimeCheck = IsConflict; 784 } 785 } 786 } 787 }// end of need-runtime-check code. 788 789 // If we are using memory runtime checks, include them in. 790 if (MemoryRuntimeCheck) { 791 Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, 792 "CntOrMem", Loc); 793 } 794 795 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 796 // Remove the old terminator. 
797 Loc->eraseFromParent(); 798 799 // We are going to resume the execution of the scalar loop. 800 // This PHI decides on what number to start. If we come from the 801 // vector loop then we need to start with the end index minus the 802 // index modulo VF. If we come from a bypass edge then we need to start 803 // from the real start. 804 PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx", 805 MiddleBlock->getTerminator()); 806 ResumeIndex->addIncoming(StartIdx, BypassBlock); 807 ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); 808 809 // Add a check in the middle block to see if we have completed 810 // all of the iterations in the first vector loop. 811 // If (N - N%VF) == N, then we *don't* need to run the remainder. 812 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 813 ResumeIndex, "cmp.n", 814 MiddleBlock->getTerminator()); 815 816 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 817 // Remove the old terminator. 818 MiddleBlock->getTerminator()->eraseFromParent(); 819 820 // Create i+1 and fill the PHINode. 821 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 822 Induction->addIncoming(StartIdx, VectorPH); 823 Induction->addIncoming(NextIdx, VecBody); 824 // Create the compare. 825 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 826 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 827 828 // Now we have two terminators. Remove the old one from the block. 829 VecBody->getTerminator()->eraseFromParent(); 830 831 // Fix the scalar body iteration count. 832 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); 833 OldInduction->setIncomingValue(BlockIdx, ResumeIndex); 834 835 // Get ready to start creating new instructions into the vectorized body. 836 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 837 838 // Register the new loop. 
  // Register the new (single-block) vector loop with the pass manager so
  // later loop passes can see it. insertLoop presumably takes ownership of
  // the heap-allocated Loop -- TODO confirm against LPPassManager.
  Loop* Lp = new Loop();
  LPM->insertLoop(Lp, OrigLoop->getParentLoop());

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // The scalar pre-header, vector pre-header and middle block do not belong
  // to the new loop, but do belong to the parent loop, if any.
  Loop *ParentLoop = OrigLoop->getParentLoop();
  if (ParentLoop) {
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  }

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
  LoopBypassBlock = BypassBlock;
}

/// This function returns the identity element (or neutral element) for
/// the operation K. The result is an unsigned bit pattern; '-1' encodes
/// the all-ones value that is the identity for AND.
static unsigned
getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
  switch (K) {
  case LoopVectorizationLegality::IntegerXor:
  case LoopVectorizationLegality::IntegerAdd:
  case LoopVectorizationLegality::IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return 0;
  case LoopVectorizationLegality::IntegerMult:
    // Multiplying a number by 1 does not change it.
    return 1;
  case LoopVectorizationLegality::IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return -1;
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

/// Widen every instruction in the (single-block) original loop into the
/// vector body, then patch up the reduction PHIs that were deferred.
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
889 // 890 //===------------------------------------------------===// 891 typedef SmallVector<PHINode*, 4> PhiVector; 892 BasicBlock &BB = *OrigLoop->getHeader(); 893 Constant *Zero = ConstantInt::get( 894 IntegerType::getInt32Ty(BB.getContext()), 0); 895 896 // In order to support reduction variables we need to be able to vectorize 897 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 898 // steages. First, we create a new vector PHI node with no incoming edges. 899 // We use this value when we vectorize all of the instructions that use the 900 // PHI. Next, after all of the instructions in the block are complete we 901 // add the new incoming edges to the PHI. At this point all of the 902 // instructions in the basic block are vectorized, so we can use them to 903 // construct the PHI. 904 PhiVector PHIsToFix; 905 906 // For each instruction in the old loop. 907 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 908 Instruction *Inst = it; 909 910 switch (Inst->getOpcode()) { 911 case Instruction::Br: 912 // Nothing to do for PHIs and BR, since we already took care of the 913 // loop control flow instructions. 914 continue; 915 case Instruction::PHI:{ 916 PHINode* P = cast<PHINode>(Inst); 917 // Special handling for the induction var. 918 if (OldInduction == Inst) 919 continue; 920 // This is phase one of vectorizing PHIs. 921 // This has to be a reduction variable. 
922 assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); 923 Type *VecTy = VectorType::get(Inst->getType(), VF); 924 WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); 925 PHIsToFix.push_back(P); 926 continue; 927 } 928 case Instruction::Add: 929 case Instruction::FAdd: 930 case Instruction::Sub: 931 case Instruction::FSub: 932 case Instruction::Mul: 933 case Instruction::FMul: 934 case Instruction::UDiv: 935 case Instruction::SDiv: 936 case Instruction::FDiv: 937 case Instruction::URem: 938 case Instruction::SRem: 939 case Instruction::FRem: 940 case Instruction::Shl: 941 case Instruction::LShr: 942 case Instruction::AShr: 943 case Instruction::And: 944 case Instruction::Or: 945 case Instruction::Xor: { 946 // Just widen binops. 947 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 948 Value *A = getVectorValue(Inst->getOperand(0)); 949 Value *B = getVectorValue(Inst->getOperand(1)); 950 951 // Use this vector value for all users of the original instruction. 952 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 953 WidenMap[Inst] = V; 954 955 // Update the NSW, NUW and Exact flags. 956 BinaryOperator *VecOp = cast<BinaryOperator>(V); 957 if (isa<OverflowingBinaryOperator>(BinOp)) { 958 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 959 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 960 } 961 if (isa<PossiblyExactOperator>(VecOp)) 962 VecOp->setIsExact(BinOp->isExact()); 963 break; 964 } 965 case Instruction::Select: { 966 // Widen selects. 967 // If the selector is loop invariant we can create a select 968 // instruction with a scalar condition. Otherwise, use vector-select. 969 Value *Cond = Inst->getOperand(0); 970 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 971 972 // The condition can be loop invariant but still defined inside the 973 // loop. This means that we can't just use the original 'cond' value. 974 // We have to take the 'vectorized' value and pick the first lane. 
975 // Instcombine will make this a no-op. 976 Cond = getVectorValue(Cond); 977 if (InvariantCond) 978 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 979 980 Value *Op0 = getVectorValue(Inst->getOperand(1)); 981 Value *Op1 = getVectorValue(Inst->getOperand(2)); 982 WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); 983 break; 984 } 985 986 case Instruction::ICmp: 987 case Instruction::FCmp: { 988 // Widen compares. Generate vector compares. 989 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 990 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 991 Value *A = getVectorValue(Inst->getOperand(0)); 992 Value *B = getVectorValue(Inst->getOperand(1)); 993 if (FCmp) 994 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 995 else 996 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 997 break; 998 } 999 1000 case Instruction::Store: { 1001 // Attempt to issue a wide store. 1002 StoreInst *SI = dyn_cast<StoreInst>(Inst); 1003 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 1004 Value *Ptr = SI->getPointerOperand(); 1005 unsigned Alignment = SI->getAlignment(); 1006 1007 assert(!Legal->isUniform(Ptr) && 1008 "We do not allow storing to uniform addresses"); 1009 1010 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1011 1012 // This store does not use GEPs. 1013 if (!Legal->isConsecutiveGep(Gep)) { 1014 scalarizeInstruction(Inst); 1015 break; 1016 } 1017 1018 // The last index does not have to be the induction. It can be 1019 // consecutive and be a function of the index. For example A[I+1]; 1020 unsigned NumOperands = Gep->getNumOperands(); 1021 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 1022 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1023 1024 // Create the new GEP with the new induction variable. 
1025 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1026 Gep2->setOperand(NumOperands - 1, LastIndex); 1027 Ptr = Builder.Insert(Gep2); 1028 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 1029 Value *Val = getVectorValue(SI->getValueOperand()); 1030 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 1031 break; 1032 } 1033 case Instruction::Load: { 1034 // Attempt to issue a wide load. 1035 LoadInst *LI = dyn_cast<LoadInst>(Inst); 1036 Type *RetTy = VectorType::get(LI->getType(), VF); 1037 Value *Ptr = LI->getPointerOperand(); 1038 unsigned Alignment = LI->getAlignment(); 1039 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1040 1041 // If we don't have a gep, or that the pointer is loop invariant, 1042 // scalarize the load. 1043 if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) { 1044 scalarizeInstruction(Inst); 1045 break; 1046 } 1047 1048 // The last index does not have to be the induction. It can be 1049 // consecutive and be a function of the index. For example A[I+1]; 1050 unsigned NumOperands = Gep->getNumOperands(); 1051 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 1052 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 1053 1054 // Create the new GEP with the new induction variable. 1055 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1056 Gep2->setOperand(NumOperands - 1, LastIndex); 1057 Ptr = Builder.Insert(Gep2); 1058 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 1059 LI = Builder.CreateLoad(Ptr); 1060 LI->setAlignment(Alignment); 1061 // Use this vector value for all users of the load. 
1062 WidenMap[Inst] = LI; 1063 break; 1064 } 1065 case Instruction::ZExt: 1066 case Instruction::SExt: 1067 case Instruction::FPToUI: 1068 case Instruction::FPToSI: 1069 case Instruction::FPExt: 1070 case Instruction::PtrToInt: 1071 case Instruction::IntToPtr: 1072 case Instruction::SIToFP: 1073 case Instruction::UIToFP: 1074 case Instruction::Trunc: 1075 case Instruction::FPTrunc: 1076 case Instruction::BitCast: { 1077 /// Vectorize bitcasts. 1078 CastInst *CI = dyn_cast<CastInst>(Inst); 1079 Value *A = getVectorValue(Inst->getOperand(0)); 1080 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 1081 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 1082 break; 1083 } 1084 1085 default: 1086 /// All other instructions are unsupported. Scalarize them. 1087 scalarizeInstruction(Inst); 1088 break; 1089 }// end of switch. 1090 }// end of for_each instr. 1091 1092 // At this point every instruction in the original loop is widended to 1093 // a vector form. We are almost done. Now, we need to fix the PHI nodes 1094 // that we vectorized. The PHI nodes are currently empty because we did 1095 // not want to introduce cycles. Notice that the remaining PHI nodes 1096 // that we need to fix are reduction variables. 1097 1098 // Create the 'reduced' values for each of the induction vars. 1099 // The reduced values are the vector values that we scalarize and combine 1100 // after the loop is finished. 1101 for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); 1102 it != e; ++it) { 1103 PHINode *RdxPhi = *it; 1104 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 1105 assert(RdxPhi && "Unable to recover vectorized PHI"); 1106 1107 // Find the reduction variable descriptor. 
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                          VecTy->getScalarType());

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                    RdxDesc.StartValue, Zero);


    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
    VecRdxPhi->addIncoming(Val, LoopVectorBody);

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    // This PHINode contains the vectorized reduction variable, or
    // the initial value vector, if we bypass the vector loop.
    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);

    // Extract the first scalar.
    Value *Scalar0 =
      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
    // Extract and reduce the remaining vector elements into Scalar0 with a
    // chain of scalar ops matching the reduction kind.
    for (unsigned i=1; i < VF; ++i) {
      Value *Scalar1 =
        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
      switch (RdxDesc.Kind) {
      case LoopVectorizationLegality::IntegerAdd:
        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerMult:
        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerOr:
        Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerAnd:
        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
        break;
      case LoopVectorizationLegality::IntegerXor:
        Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
        break;
      default:
        llvm_unreachable("Unknown reduction operation");
      }
    }

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.
}

/// Invalidate SCEV for the original loop and teach the dominator tree about
/// the blocks created by createEmptyLoop().
void SingleBlockLoopVectorizer::updateAnalysis() {
  // The original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}

/// Top-level legality query: true if this loop may be vectorized at all.
bool LoopVectorizationLegality::canVectorize() {
  if (!TheLoop->getLoopPreheader()) {
    // NOTE(review): in asserts builds this fires before the graceful
    // 'return false' below can run, making the bail-out path dead there --
    // verify this hard failure is intended for non-simplified loops.
    assert(false && "No preheader!!");
    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
    return false;
  }

  // We can only vectorize single basic block loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1) {
    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *BB = TheLoop->getHeader();
  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count (TC==0 means the
  // trip count is unknown, which is fine).
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
  if (TC > 0u && TC < TinyTripCountThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeBlock(*BB)) {
    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
    return false;
  }

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

/// Scan a single-block loop for vectorization hazards: unsupported PHIs,
/// calls, unvectorizable types, illegal外-of-block users and memory
/// dependences. On success, also collects the set of values that stay
/// uniform after vectorization.
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
  // Scan the instructions in the block and look for hazards.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    PHINode *Phi = dyn_cast<PHINode>(I);
    if (Phi) {
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }
      // We only look at integer phi nodes.
      if (!Phi->getType()->isIntegerTy()) {
        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
        return false;
      }

      if (isInductionVariable(Phi)) {
        // Only a single induction variable is supported.
        if (Induction) {
          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
          return false;
        }
        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
        Induction = Phi;
        continue;
      }
      // Try each supported reduction kind in turn.
      if (AddReductionVar(Phi, IntegerAdd)) {
        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerMult)) {
        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerOr)) {
        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerAnd)) {
        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
        continue;
      }
      if (AddReductionVar(Phi, IntegerXor)) {
        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
        continue;
      }

      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
      return false;
    }// end of PHI handling

    // We still don't handle functions.
    CallInst *CI = dyn_cast<CallInst>(I);
    if (CI) {
      DEBUG(dbgs() << "LV: Found a call site.\n");
      return false;
    }

    // We do not re-vectorize vectors.
    if (!VectorType::isValidElementType(I->getType()) &&
        !I->getType()->isVoidTy()) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }

    // Reduction instructions are allowed to have exit users.
    // All other instructions must not have external users.
    if (!AllowedExit.count(I))
      //Check that all of the users of the loop are inside the BB.
      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
           it != e; ++it) {
        Instruction *U = cast<Instruction>(*it);
        // This user may be a reduction exit value.
        BasicBlock *Parent = U->getParent();
        if (Parent != &BB) {
          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
          return false;
        }
      }
  } // next instr.

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find an induction var.\n");
    return false;
  }

  // Don't vectorize if the memory dependencies do not allow vectorization.
  if (!canVectorizeMemory(BB))
    return false;

  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(BB.getTerminator()->getOperand(0));

  // Backwards dataflow: anything only feeding the exit condition stays
  // uniform (scalar) in the vector loop.
  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();
    // Look at instructions inside this block.
    if (!I) continue;
    if (I->getParent() != &BB) continue;

    // Stop when reaching PHI nodes.
    if (isa<PHINode>(I)) {
      assert(I == Induction && "Found a uniform PHI that is not the induction");
      break;
    }

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
      Worklist.push_back(I->getOperand(i));
    }
  }

  return true;
}

/// Check the memory dependences of the block and decide whether
/// vectorization is legal, possibly with a runtime pointer-overlap check
/// (recorded in PtrRtCheck).
bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // Scan the BB and collect legal loads and stores.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    // If this is a load, save it. If this instruction can read from memory
    // but is not a load, then we quit. Notice that we don't handle function
    // calls that read or write.
    if (I->mayReadFromMemory()) {
      LoadInst *Ld = dyn_cast<LoadInst>(I);
      if (!Ld) return false;
      if (!Ld->isSimple()) {
        // Volatile/atomic loads are not handled.
        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
        return false;
      }
      Loads.push_back(Ld);
      continue;
    }

    // Save store instructions. Abort if other instructions write to memory.
    if (I->mayWriteToMemory()) {
      StoreInst *St = dyn_cast<StoreInst>(I);
      if (!St) return false;
      if (!St->isSimple()) {
        // Volatile/atomic stores are not handled.
        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
        return false;
      }
      Stores.push_back(St);
    }
  } // next instr.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = dyn_cast<StoreInst>(*I);
    assert(ST && "Bad StoreInst");
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = dyn_cast<LoadInst>(*I);
    assert(LD && "Bad LoadInst");
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool RT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.Pointers.push_back(*I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      RT = false;
      break;
    }

  // Check that we did not collect too many pointers or found a
  // unsizeable pointer.
  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.Pointers.clear();
    RT = false;
  }

  PtrRtCheck.Need = RT;

  if (RT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  // From here on, returning 'RT' means "statically unsafe, but a runtime
  // check can still save us" when RT is true.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        return RT;
      }
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        return RT;
      }
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return RT;
      }
    }
    TempObjects.clear();
  }

  // It is safe to vectorize and we don't need any runtime checks.
  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;
  return true;
}

/// Try to recognize 'Phi' as a reduction of kind 'Kind'. On success the
/// reduction descriptor is recorded and the exit instruction is allowed to
/// have out-of-loop users.
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Find the possible incoming reduction variable.
  BasicBlock *BB = Phi->getParent();
  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions which can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  // Also, we can't have multiple block-local users.
  Instruction *Iter = Phi;
  while (true) {
    // Any reduction instr must be of one of the allowed kinds.
    if (!isReductionInstr(Iter, Kind))
      return false;

    // Did we find a user inside this block ?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node ?
    bool FoundStartPHI = false;

    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // For each of the *users* of iter.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }
      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (Parent != BB) {
        // We must have a single exit instruction.
        if (ExitInstruction != 0)
          return false;
        // Record the in-loop instruction whose value escapes the loop.
        ExitInstruction = Iter;
      }
      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;
      // Advance along the use-chain.
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    // NOTE(review): if the chain closes back on the PHI but no exit
    // instruction exists (or vice versa) this 'while (true)' appears to
    // keep walking the same cycle without a terminating bail-out --
    // verify termination for loops whose reduction value never escapes.
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      return true;
    }
  }
}

/// Return true if 'I' is an instruction that can participate in a
/// reduction of kind 'Kind'.
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind) {
  switch (I->getOpcode()) {
  default:
    return false;
  case Instruction::PHI:
    // possibly.
    return true;
  case Instruction::Add:
  case Instruction::Sub:
    return Kind == IntegerAdd;
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return Kind == IntegerMult;
  case Instruction::And:
    return Kind == IntegerAnd;
  case Instruction::Or:
    return Kind == IntegerOr;
  case Instruction::Xor:
    return Kind == IntegerXor;
  }
}

/// Return true if 'Phi' is an affine recurrence with a stride of one
/// (the comment about "starts at zero" is aspirational: only the step is
/// actually checked here).
bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  // Check that the PHI is consecutive and starts at zero.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return false;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  if (!Step->isOne()) {
    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
    return false;
  }
  return true;
}

/// Return true if SCEV can express 'Ptr' as an affine recurrence, i.e. its
/// start and end values across the loop can be computed for a runtime check.
bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  const SCEV *PhiScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

/// Pick the most profitable vectorization factor in {1, 2, 4, ..., VF} by
/// comparing per-iteration cost estimates. Returns 1 when vectorization is
/// not profitable (or no target info is available).
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
  if (!VTTI) {
    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
    return 1;
  }

  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
1732 float VectorCost = expectedCost(i) / (float)i; 1733 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 1734 (int)VectorCost << ".\n"); 1735 if (VectorCost < Cost) { 1736 Cost = VectorCost; 1737 Width = i; 1738 } 1739 } 1740 1741 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 1742 return Width; 1743 } 1744 1745 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 1746 // We can only estimate the cost of single basic block loops. 1747 assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); 1748 1749 BasicBlock *BB = TheLoop->getHeader(); 1750 unsigned Cost = 0; 1751 1752 // For each instruction in the old loop. 1753 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1754 Instruction *Inst = it; 1755 unsigned C = getInstructionCost(Inst, VF); 1756 Cost += C; 1757 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << 1758 " For instruction: "<< *Inst << "\n"); 1759 } 1760 1761 return Cost; 1762 } 1763 1764 unsigned 1765 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 1766 assert(VTTI && "Invalid vector target transformation info"); 1767 1768 // If we know that this instruction will remain uniform, check the cost of 1769 // the scalar version. 1770 if (Legal->isUniformAfterVectorization(I)) 1771 VF = 1; 1772 1773 Type *RetTy = I->getType(); 1774 Type *VectorTy = ToVectorTy(RetTy, VF); 1775 1776 1777 // TODO: We need to estimate the cost of intrinsic calls. 1778 switch (I->getOpcode()) { 1779 case Instruction::GetElementPtr: 1780 // We mark this instruction as zero-cost because scalar GEPs are usually 1781 // lowered to the intruction addressing mode. At the moment we don't 1782 // generate vector geps. 
1783 return 0; 1784 case Instruction::Br: { 1785 return VTTI->getCFInstrCost(I->getOpcode()); 1786 } 1787 case Instruction::PHI: 1788 return 0; 1789 case Instruction::Add: 1790 case Instruction::FAdd: 1791 case Instruction::Sub: 1792 case Instruction::FSub: 1793 case Instruction::Mul: 1794 case Instruction::FMul: 1795 case Instruction::UDiv: 1796 case Instruction::SDiv: 1797 case Instruction::FDiv: 1798 case Instruction::URem: 1799 case Instruction::SRem: 1800 case Instruction::FRem: 1801 case Instruction::Shl: 1802 case Instruction::LShr: 1803 case Instruction::AShr: 1804 case Instruction::And: 1805 case Instruction::Or: 1806 case Instruction::Xor: { 1807 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 1808 } 1809 case Instruction::Select: { 1810 SelectInst *SI = cast<SelectInst>(I); 1811 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 1812 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 1813 Type *CondTy = SI->getCondition()->getType(); 1814 if (ScalarCond) 1815 CondTy = VectorType::get(CondTy, VF); 1816 1817 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 1818 } 1819 case Instruction::ICmp: 1820 case Instruction::FCmp: { 1821 Type *ValTy = I->getOperand(0)->getType(); 1822 VectorTy = ToVectorTy(ValTy, VF); 1823 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 1824 } 1825 case Instruction::Store: { 1826 StoreInst *SI = cast<StoreInst>(I); 1827 Type *ValTy = SI->getValueOperand()->getType(); 1828 VectorTy = ToVectorTy(ValTy, VF); 1829 1830 if (VF == 1) 1831 return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, 1832 SI->getAlignment(), SI->getPointerAddressSpace()); 1833 1834 // Scalarized stores. 1835 if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { 1836 unsigned Cost = 0; 1837 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 1838 ValTy); 1839 // The cost of extracting from the value vector. 1840 Cost += VF * (ExtCost); 1841 // The cost of the scalar stores. 
1842 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 1843 ValTy->getScalarType(), 1844 SI->getAlignment(), 1845 SI->getPointerAddressSpace()); 1846 return Cost; 1847 } 1848 1849 // Wide stores. 1850 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), 1851 SI->getPointerAddressSpace()); 1852 } 1853 case Instruction::Load: { 1854 LoadInst *LI = cast<LoadInst>(I); 1855 1856 if (VF == 1) 1857 return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, 1858 LI->getAlignment(), 1859 LI->getPointerAddressSpace()); 1860 1861 // Scalarized loads. 1862 if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { 1863 unsigned Cost = 0; 1864 unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); 1865 // The cost of inserting the loaded value into the result vector. 1866 Cost += VF * (InCost); 1867 // The cost of the scalar stores. 1868 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 1869 RetTy->getScalarType(), 1870 LI->getAlignment(), 1871 LI->getPointerAddressSpace()); 1872 return Cost; 1873 } 1874 1875 // Wide loads. 1876 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), 1877 LI->getPointerAddressSpace()); 1878 } 1879 case Instruction::ZExt: 1880 case Instruction::SExt: 1881 case Instruction::FPToUI: 1882 case Instruction::FPToSI: 1883 case Instruction::FPExt: 1884 case Instruction::PtrToInt: 1885 case Instruction::IntToPtr: 1886 case Instruction::SIToFP: 1887 case Instruction::UIToFP: 1888 case Instruction::Trunc: 1889 case Instruction::FPTrunc: 1890 case Instruction::BitCast: { 1891 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 1892 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 1893 } 1894 default: { 1895 // We are scalarizing the instruction. Return the cost of the scalar 1896 // instruction, plus the cost of insert and extract into vector 1897 // elements, times the vector width. 
1898 unsigned Cost = 0; 1899 1900 bool IsVoid = RetTy->isVoidTy(); 1901 1902 unsigned InsCost = (IsVoid ? 0 : 1903 VTTI->getInstrCost(Instruction::InsertElement, 1904 VectorTy)); 1905 1906 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 1907 VectorTy); 1908 1909 // The cost of inserting the results plus extracting each one of the 1910 // operands. 1911 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 1912 1913 // The cost of executing VF copies of the scalar instruction. 1914 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 1915 return Cost; 1916 } 1917 }// end of switch. 1918 } 1919 1920 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 1921 if (Scalar->isVoidTy() || VF == 1) 1922 return Scalar; 1923 return VectorType::get(Scalar, VF); 1924 } 1925 1926 } // namespace 1927 1928 char LoopVectorize::ID = 0; 1929 static const char lv_name[] = "Loop Vectorization"; 1930 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 1931 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1932 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1933 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1934 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 1935 1936 namespace llvm { 1937 Pass *createLoopVectorizePass() { 1938 return new LoopVectorize(); 1939 } 1940 } 1941 1942