1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11 // and generates target-independent LLVM-IR. Legalization of the IR is done 12 // in the codegen. However, the vectorizes uses (will use) the codegen 13 // interfaces to generate IR that is likely to result in an optimal binary. 14 // 15 // The loop vectorizer combines consecutive loop iteration into a single 16 // 'wide' iteration. After this transformation the index is incremented 17 // by the SIMD vector width, and not by one. 18 // 19 // This pass has three parts: 20 // 1. The main loop pass that drives the different parts. 21 // 2. LoopVectorizationLegality - A unit that checks for the legality 22 // of the vectorization. 23 // 3. SingleBlockLoopVectorizer - A unit that performs the actual 24 // widening of instructions. 25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability 26 // of vectorization. It decides on the optimal vector width, which 27 // can be one, if vectorization is not profitable. 28 //===----------------------------------------------------------------------===// 29 // 30 // The reduction-variable vectorization is based on the paper: 31 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 32 // 33 // Variable uniformity checks are inspired by: 34 // Karrenberg, R. and Hack, S. Whole Function Vectorization. 35 // 36 // Other ideas/concepts are from: 37 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 38 // 39 //===----------------------------------------------------------------------===// 40 #define LV_NAME "loop-vectorize" 41 #define DEBUG_TYPE LV_NAME 42 #include "llvm/Constants.h" 43 #include "llvm/DerivedTypes.h" 44 #include "llvm/Instructions.h" 45 #include "llvm/LLVMContext.h" 46 #include "llvm/Pass.h" 47 #include "llvm/Analysis/LoopPass.h" 48 #include "llvm/Value.h" 49 #include "llvm/Function.h" 50 #include "llvm/Analysis/Verifier.h" 51 #include "llvm/Module.h" 52 #include "llvm/Type.h" 53 #include "llvm/ADT/SmallVector.h" 54 #include "llvm/ADT/StringExtras.h" 55 #include "llvm/Analysis/AliasAnalysis.h" 56 #include "llvm/Analysis/AliasSetTracker.h" 57 #include "llvm/Analysis/ScalarEvolution.h" 58 #include "llvm/Analysis/Dominators.h" 59 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 60 #include "llvm/Analysis/ScalarEvolutionExpander.h" 61 #include "llvm/Analysis/LoopInfo.h" 62 #include "llvm/Analysis/ValueTracking.h" 63 #include "llvm/Transforms/Scalar.h" 64 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 65 #include "llvm/TargetTransformInfo.h" 66 #include "llvm/Support/CommandLine.h" 67 #include "llvm/Support/Debug.h" 68 #include "llvm/Support/raw_ostream.h" 69 #include "llvm/DataLayout.h" 70 #include "llvm/Transforms/Utils/Local.h" 71 #include <algorithm> 72 using namespace llvm; 73 74 static cl::opt<unsigned> 75 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 76 cl::desc("Set the default vectorization width. Zero is autoselect.")); 77 78 /// We don't vectorize loops with a known constant trip count below this number. 79 const unsigned TinyTripCountThreshold = 16; 80 81 namespace { 82 83 // Forward declarations. 84 class LoopVectorizationLegality; 85 class LoopVectorizationCostModel; 86 87 /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic 88 /// block to a specified vectorization factor (VF). 89 /// This class performs the widening of scalars into vectors, or multiple 90 /// scalars. This class also implements the following features: 91 /// * It inserts an epilogue loop for handling loops that don't have iteration 92 /// counts that are known to be a multiple of the vectorization factor. 93 /// * It handles the code generation for reduction variables. 94 /// * Scalarization (implementation using scalars) of un-vectorizable 95 /// instructions. 96 /// SingleBlockLoopVectorizer does not perform any vectorization-legality 97 /// checks, and relies on the caller to check for the different legality 98 /// aspects. The SingleBlockLoopVectorizer relies on the 99 /// LoopVectorizationLegality class to provide information about the induction 100 /// and reduction variables that were found to a given vectorization factor. 101 class SingleBlockLoopVectorizer { 102 public: 103 /// Ctor. 104 SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, 105 DominatorTree *dt, LPPassManager *Lpm, 106 unsigned VecWidth): 107 OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), 108 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 109 110 // Perform the actual loop widening (vectorization). 111 void vectorize(LoopVectorizationLegality *Legal) { 112 ///Create a new empty loop. Unlink the old loop and connect the new one. 113 createEmptyLoop(Legal); 114 /// Widen each instruction in the old loop to a new one in the new loop. 115 /// Use the Legality module to find the induction and reduction variables. 116 vectorizeLoop(Legal); 117 // register the new loop. 118 updateAnalysis(); 119 } 120 121 private: 122 /// Create an empty loop, based on the loop ranges of the old loop. 123 void createEmptyLoop(LoopVectorizationLegality *Legal); 124 /// Copy and widen the instructions from the old loop. 125 void vectorizeLoop(LoopVectorizationLegality *Legal); 126 /// Insert the new loop to the loop hierarchy and pass manager. 127 void updateAnalysis(); 128 129 /// This instruction is un-vectorizable. Implement it as a sequence 130 /// of scalars. 131 void scalarizeInstruction(Instruction *Instr); 132 133 /// Create a broadcast instruction. This method generates a broadcast 134 /// instruction (shuffle) for loop invariant values and for the induction 135 /// value. If this is the induction variable then we extend it to N, N+1, ... 136 /// this is needed because each iteration in the loop corresponds to a SIMD 137 /// element. 138 Value *getBroadcastInstrs(Value *V); 139 140 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 141 /// for each element in the vector. Starting from zero. 142 Value *getConsecutiveVector(Value* Val); 143 144 /// When we go over instructions in the basic block we rely on previous 145 /// values within the current basic block or on loop invariant values. 146 /// When we widen (vectorize) values we place them in the map. If the values 147 /// are not within the map, they have to be loop invariant, so we simply 148 /// broadcast them into a vector. 149 Value *getVectorValue(Value *V); 150 151 /// Get a uniform vector of constant integers. We use this to get 152 /// vectors of ones and zeros for the reduction code. 153 Constant* getUniformVector(unsigned Val, Type* ScalarTy); 154 155 typedef DenseMap<Value*, Value*> ValueMap; 156 157 /// The original loop. 158 Loop *OrigLoop; 159 // Scev analysis to use. 160 ScalarEvolution *SE; 161 // Loop Info. 162 LoopInfo *LI; 163 // Dominator Tree. 164 DominatorTree *DT; 165 // Loop Pass Manager; 166 LPPassManager *LPM; 167 // The vectorization factor to use. 168 unsigned VF; 169 170 // The builder that we use 171 IRBuilder<> Builder; 172 173 // --- Vectorization state --- 174 175 /// The vector-loop preheader. 176 BasicBlock *LoopVectorPreHeader; 177 /// The scalar-loop preheader. 178 BasicBlock *LoopScalarPreHeader; 179 /// Middle Block between the vector and the scalar. 180 BasicBlock *LoopMiddleBlock; 181 ///The ExitBlock of the scalar loop. 182 BasicBlock *LoopExitBlock; 183 ///The vector loop body. 184 BasicBlock *LoopVectorBody; 185 ///The scalar loop body. 186 BasicBlock *LoopScalarBody; 187 ///The first bypass block. 188 BasicBlock *LoopBypassBlock; 189 190 /// The new Induction variable which was added to the new block. 191 PHINode *Induction; 192 /// The induction variable of the old basic block. 193 PHINode *OldInduction; 194 // Maps scalars to widened vectors. 195 ValueMap WidenMap; 196 }; 197 198 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 199 /// to what vectorization factor. 200 /// This class does not look at the profitability of vectorization, only the 201 /// legality. This class has two main kinds of checks: 202 /// * Memory checks - The code in canVectorizeMemory checks if vectorization 203 /// will change the order of memory accesses in a way that will change the 204 /// correctness of the program. 205 /// * Scalars checks - The code in canVectorizeBlock checks for a number 206 /// of different conditions, such as the availability of a single induction 207 /// variable, that all types are supported and vectorize-able, etc. 208 /// This code reflects the capabilities of SingleBlockLoopVectorizer. 209 /// This class is also used by SingleBlockLoopVectorizer for identifying 210 /// induction variable and the different reduction variables. 211 class LoopVectorizationLegality { 212 public: 213 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 214 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } 215 216 /// This represents the kinds of reductions that we support. 217 enum ReductionKind { 218 NoReduction, /// Not a reduction. 219 IntegerAdd, /// Sum of numbers. 220 IntegerMult, /// Product of numbers. 221 IntegerOr, /// Bitwise or logical OR of numbers. 222 IntegerAnd, /// Bitwise or logical AND of numbers. 223 IntegerXor /// Bitwise or logical XOR of numbers. 224 }; 225 226 /// This POD struct holds information about reduction variables. 227 struct ReductionDescriptor { 228 // Default C'tor 229 ReductionDescriptor(): 230 StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} 231 232 // C'tor. 233 ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): 234 StartValue(Start), LoopExitInstr(Exit), Kind(K) {} 235 236 // The starting value of the reduction. 237 // It does not have to be zero! 238 Value *StartValue; 239 // The instruction who's value is used outside the loop. 240 Instruction *LoopExitInstr; 241 // The kind of the reduction. 242 ReductionKind Kind; 243 }; 244 245 /// ReductionList contains the reduction descriptors for all 246 /// of the reductions that were found in the loop. 247 typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 248 249 /// Returns true if it is legal to vectorize this loop. 250 /// This does not mean that it is profitable to vectorize this 251 /// loop, only that it is legal to do so. 252 bool canVectorize(); 253 254 /// Returns the Induction variable. 255 PHINode *getInduction() {return Induction;} 256 257 /// Returns the reduction variables found in the loop. 258 ReductionList *getReductionVars() { return &Reductions; } 259 260 /// Check if the pointer returned by this GEP is consecutive 261 /// when the index is vectorized. This happens when the last 262 /// index of the GEP is consecutive, like the induction variable. 263 /// This check allows us to vectorize A[idx] into a wide load/store. 264 bool isConsecutiveGep(Value *Ptr); 265 266 /// Returns true if this instruction will remain scalar after vectorization. 267 bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} 268 269 private: 270 /// Check if a single basic block loop is vectorizable. 271 /// At this point we know that this is a loop with a constant trip count 272 /// and we only need to check individual instructions. 273 bool canVectorizeBlock(BasicBlock &BB); 274 275 /// When we vectorize loops we may change the order in which 276 /// we read and write from memory. This method checks if it is 277 /// legal to vectorize the code, considering only memory constrains. 278 /// Returns true if BB is vectorizable 279 bool canVectorizeMemory(BasicBlock &BB); 280 281 /// Returns True, if 'Phi' is the kind of reduction variable for type 282 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. 283 bool AddReductionVar(PHINode *Phi, ReductionKind Kind); 284 /// Returns true if the instruction I can be a reduction variable of type 285 /// 'Kind'. 286 bool isReductionInstr(Instruction *I, ReductionKind Kind); 287 /// Returns True, if 'Phi' is an induction variable. 288 bool isInductionVariable(PHINode *Phi); 289 290 /// The loop that we evaluate. 291 Loop *TheLoop; 292 /// Scev analysis. 293 ScalarEvolution *SE; 294 /// DataLayout analysis. 295 DataLayout *DL; 296 297 // --- vectorization state --- // 298 299 /// Holds the induction variable. 300 PHINode *Induction; 301 /// Holds the reduction variables. 302 ReductionList Reductions; 303 /// Allowed outside users. This holds the reduction 304 /// vars which can be accessed from outside the loop. 305 SmallPtrSet<Value*, 4> AllowedExit; 306 /// This set holds the variables which are known to be uniform after 307 /// vectorization. 308 SmallPtrSet<Instruction*, 4> Uniforms; 309 }; 310 311 /// LoopVectorizationCostModel - estimates the expected speedups due to 312 /// vectorization. 313 /// In many cases vectorization is not profitable. This can happen because 314 /// of a number of reasons. In this class we mainly attempt to predict 315 /// the expected speedup/slowdowns due to the supported instruction set. 316 /// We use the VectorTargetTransformInfo to query the different backends 317 /// for the cost of different operations. 318 class LoopVectorizationCostModel { 319 public: 320 /// C'tor. 321 LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, 322 LoopVectorizationLegality *Leg, 323 const VectorTargetTransformInfo *Vtti): 324 TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } 325 326 /// Returns the most profitable vectorization factor for the loop that is 327 /// smaller or equal to the VF argument. This method checks every power 328 /// of two up to VF. 329 unsigned findBestVectorizationFactor(unsigned VF = 8); 330 331 private: 332 /// Returns the expected execution cost. The unit of the cost does 333 /// not matter because we use the 'cost' units to compare different 334 /// vector widths. The cost that is returned is *not* normalized by 335 /// the factor width. 336 unsigned expectedCost(unsigned VF); 337 338 /// Returns the execution time cost of an instruction for a given vector 339 /// width. Vector width of one means scalar. 340 unsigned getInstructionCost(Instruction *I, unsigned VF); 341 342 /// A helper function for converting Scalar types to vector types. 343 /// If the incoming type is void, we return void. If the VF is 1, we return 344 /// the scalar type. 345 static Type* ToVectorTy(Type *Scalar, unsigned VF); 346 347 /// The loop that we evaluate. 348 Loop *TheLoop; 349 /// Scev analysis. 350 ScalarEvolution *SE; 351 352 /// Vectorization legality. 353 LoopVectorizationLegality *Legal; 354 /// Vector target information. 355 const VectorTargetTransformInfo *VTTI; 356 }; 357 358 struct LoopVectorize : public LoopPass { 359 static char ID; // Pass identification, replacement for typeid 360 361 LoopVectorize() : LoopPass(ID) { 362 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 363 } 364 365 ScalarEvolution *SE; 366 DataLayout *DL; 367 LoopInfo *LI; 368 TargetTransformInfo *TTI; 369 DominatorTree *DT; 370 371 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 372 // We only vectorize innermost loops. 373 if (!L->empty()) 374 return false; 375 376 SE = &getAnalysis<ScalarEvolution>(); 377 DL = getAnalysisIfAvailable<DataLayout>(); 378 LI = &getAnalysis<LoopInfo>(); 379 TTI = getAnalysisIfAvailable<TargetTransformInfo>(); 380 DT = &getAnalysis<DominatorTree>(); 381 382 DEBUG(dbgs() << "LV: Checking a loop in \"" << 383 L->getHeader()->getParent()->getName() << "\"\n"); 384 385 // Check if it is legal to vectorize the loop. 386 LoopVectorizationLegality LVL(L, SE, DL); 387 if (!LVL.canVectorize()) { 388 DEBUG(dbgs() << "LV: Not vectorizing.\n"); 389 return false; 390 } 391 392 // Select the preffered vectorization factor. 393 unsigned VF = 1; 394 if (VectorizationFactor == 0) { 395 const VectorTargetTransformInfo *VTTI = 0; 396 if (TTI) 397 VTTI = TTI->getVectorTargetTransformInfo(); 398 // Use the cost model. 399 LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); 400 VF = CM.findBestVectorizationFactor(); 401 402 if (VF == 1) { 403 DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 404 return false; 405 } 406 407 } else { 408 // Use the user command flag. 409 VF = VectorizationFactor; 410 } 411 412 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< 413 L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< 414 "\n"); 415 416 // If we decided that it is *legal* to vectorizer the loop then do it. 417 SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); 418 LB.vectorize(&LVL); 419 420 DEBUG(verifyFunction(*L->getHeader()->getParent())); 421 return true; 422 } 423 424 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 425 LoopPass::getAnalysisUsage(AU); 426 AU.addRequiredID(LoopSimplifyID); 427 AU.addRequiredID(LCSSAID); 428 AU.addRequired<LoopInfo>(); 429 AU.addRequired<ScalarEvolution>(); 430 AU.addRequired<DominatorTree>(); 431 AU.addPreserved<LoopInfo>(); 432 AU.addPreserved<DominatorTree>(); 433 } 434 435 }; 436 437 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { 438 // Instructions that access the old induction variable 439 // actually want to get the new one. 440 if (V == OldInduction) 441 V = Induction; 442 // Create the types. 443 LLVMContext &C = V->getContext(); 444 Type *VTy = VectorType::get(V->getType(), VF); 445 Type *I32 = IntegerType::getInt32Ty(C); 446 Constant *Zero = ConstantInt::get(I32, 0); 447 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 448 Value *UndefVal = UndefValue::get(VTy); 449 // Insert the value into a new vector. 450 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 451 // Broadcast the scalar into all locations in the vector. 452 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 453 "broadcast"); 454 // We are accessing the induction variable. Make sure to promote the 455 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 456 if (V == Induction) 457 return getConsecutiveVector(Shuf); 458 return Shuf; 459 } 460 461 Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { 462 assert(Val->getType()->isVectorTy() && "Must be a vector"); 463 assert(Val->getType()->getScalarType()->isIntegerTy() && 464 "Elem must be an integer"); 465 // Create the types. 466 Type *ITy = Val->getType()->getScalarType(); 467 VectorType *Ty = cast<VectorType>(Val->getType()); 468 unsigned VLen = Ty->getNumElements(); 469 SmallVector<Constant*, 8> Indices; 470 471 // Create a vector of consecutive numbers from zero to VF. 472 for (unsigned i = 0; i < VLen; ++i) 473 Indices.push_back(ConstantInt::get(ITy, i)); 474 475 // Add the consecutive indices to the vector value. 476 Constant *Cv = ConstantVector::get(Indices); 477 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 478 return Builder.CreateAdd(Val, Cv, "induction"); 479 } 480 481 bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { 482 GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 483 if (!Gep) 484 return false; 485 486 unsigned NumOperands = Gep->getNumOperands(); 487 Value *LastIndex = Gep->getOperand(NumOperands - 1); 488 489 // Check that all of the gep indices are uniform except for the last. 490 for (unsigned i = 0; i < NumOperands - 1; ++i) 491 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 492 return false; 493 494 // We can emit wide load/stores only of the last index is the induction 495 // variable. 496 const SCEV *Last = SE->getSCEV(LastIndex); 497 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 498 const SCEV *Step = AR->getStepRecurrence(*SE); 499 500 // The memory is consecutive because the last index is consecutive 501 // and all other indices are loop invariant. 502 if (Step->isOne()) 503 return true; 504 } 505 506 return false; 507 } 508 509 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 510 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 511 // If we saved a vectorized copy of V, use it. 512 Value *&MapEntry = WidenMap[V]; 513 if (MapEntry) 514 return MapEntry; 515 516 // Broadcast V and save the value for future uses. 517 Value *B = getBroadcastInstrs(V); 518 MapEntry = B; 519 return B; 520 } 521 522 Constant* 523 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { 524 SmallVector<Constant*, 8> Indices; 525 // Create a vector of consecutive numbers from zero to VF. 526 for (unsigned i = 0; i < VF; ++i) 527 Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); 528 529 // Add the consecutive indices to the vector value. 530 return ConstantVector::get(Indices); 531 } 532 533 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 534 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 535 // Holds vector parameters or scalars, in case of uniform vals. 536 SmallVector<Value*, 8> Params; 537 538 // Find all of the vectorized parameters. 539 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 540 Value *SrcOp = Instr->getOperand(op); 541 542 // If we are accessing the old induction variable, use the new one. 543 if (SrcOp == OldInduction) { 544 Params.push_back(getBroadcastInstrs(Induction)); 545 continue; 546 } 547 548 // Try using previously calculated values. 549 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 550 551 // If the src is an instruction that appeared earlier in the basic block 552 // then it should already be vectorized. 553 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 554 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 555 // The parameter is a vector value from earlier. 556 Params.push_back(WidenMap[SrcInst]); 557 } else { 558 // The parameter is a scalar from outside the loop. Maybe even a constant. 559 Params.push_back(SrcOp); 560 } 561 } 562 563 assert(Params.size() == Instr->getNumOperands() && 564 "Invalid number of operands"); 565 566 // Does this instruction return a value ? 567 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 568 Value *VecResults = 0; 569 570 // If we have a return value, create an empty vector. We place the scalarized 571 // instructions in this vector. 572 if (!IsVoidRetTy) 573 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 574 575 // For each scalar that we create: 576 for (unsigned i = 0; i < VF; ++i) { 577 Instruction *Cloned = Instr->clone(); 578 if (!IsVoidRetTy) 579 Cloned->setName(Instr->getName() + ".cloned"); 580 // Replace the operands of the cloned instrucions with extracted scalars. 581 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 582 Value *Op = Params[op]; 583 // Param is a vector. Need to extract the right lane. 584 if (Op->getType()->isVectorTy()) 585 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 586 Cloned->setOperand(op, Op); 587 } 588 589 // Place the cloned scalar in the new loop. 590 Builder.Insert(Cloned); 591 592 // If the original scalar returns a value we need to place it in a vector 593 // so that future users will be able to use it. 594 if (!IsVoidRetTy) 595 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 596 Builder.getInt32(i)); 597 } 598 599 if (!IsVoidRetTy) 600 WidenMap[Instr] = VecResults; 601 } 602 603 void 604 SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 605 /* 606 In this function we generate a new loop. The new loop will contain 607 the vectorized instructions while the old loop will continue to run the 608 scalar remainder. 609 610 [ ] <-- vector loop bypass. 611 / | 612 / v 613 | [ ] <-- vector pre header. 614 | | 615 | v 616 | [ ] \ 617 | [ ]_| <-- vector loop. 618 | | 619 \ v 620 >[ ] <--- middle-block. 621 / | 622 / v 623 | [ ] <--- new preheader. 624 | | 625 | v 626 | [ ] \ 627 | [ ]_| <-- old scalar loop to handle remainder. 628 \ | 629 \ v 630 >[ ] <-- exit block. 631 ... 632 */ 633 634 // This is the original scalar-loop preheader. 635 BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); 636 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 637 assert(ExitBlock && "Must have an exit block"); 638 639 // The loop index does not have to start at Zero. It starts with this value. 640 OldInduction = Legal->getInduction(); 641 Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); 642 643 assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); 644 assert(BypassBlock && "Invalid loop structure"); 645 646 BasicBlock *VectorPH = 647 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 648 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 649 "vector.body"); 650 651 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 652 "middle.block"); 653 BasicBlock *ScalarPH = 654 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 655 "scalar.preheader"); 656 // Find the induction variable. 657 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 658 assert(OldInduction && "We must have a single phi node."); 659 Type *IdxTy = OldInduction->getType(); 660 661 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 662 // inside the loop. 663 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 664 665 // Generate the induction variable. 666 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 667 Constant *Step = ConstantInt::get(IdxTy, VF); 668 669 // Find the loop boundaries. 670 const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); 671 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 672 673 // Get the total trip count from the count by adding 1. 674 ExitCount = SE->getAddExpr(ExitCount, 675 SE->getConstant(ExitCount->getType(), 1)); 676 677 // Expand the trip count and place the new instructions in the preheader. 678 // Notice that the pre-header does not change, only the loop body. 679 SCEVExpander Exp(*SE, "induction"); 680 Instruction *Loc = BypassBlock->getTerminator(); 681 682 // We may need to extend the index in case there is a type mismatch. 683 // We know that the count starts at zero and does not overflow. 684 // We are using Zext because it should be less expensive. 685 if (ExitCount->getType() != Induction->getType()) 686 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 687 688 // Count holds the overall loop count (N). 689 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 690 691 // Add the start index to the loop count to get the new end index. 692 Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); 693 694 // Now we need to generate the expression for N - (N % VF), which is 695 // the part that the vectorized body will execute. 696 Constant *CIVF = ConstantInt::get(IdxTy, VF); 697 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 698 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 699 Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, 700 "end.idx.rnd.down", Loc); 701 702 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 703 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 704 IdxEndRoundDown, 705 StartIdx, 706 "cmp.zero", Loc); 707 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 708 // Remove the old terminator. 709 Loc->eraseFromParent(); 710 711 // Add a check in the middle block to see if we have completed 712 // all of the iterations in the first vector loop. 713 // If (N - N%VF) == N, then we *don't* need to run the remainder. 714 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, 715 IdxEndRoundDown, "cmp.n", 716 MiddleBlock->getTerminator()); 717 718 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 719 // Remove the old terminator. 720 MiddleBlock->getTerminator()->eraseFromParent(); 721 722 // Create i+1 and fill the PHINode. 723 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 724 Induction->addIncoming(StartIdx, VectorPH); 725 Induction->addIncoming(NextIdx, VecBody); 726 // Create the compare. 727 Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); 728 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 729 730 // Now we have two terminators. Remove the old one from the block. 731 VecBody->getTerminator()->eraseFromParent(); 732 733 // Fix the scalar body iteration count. 734 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); 735 OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); 736 737 // Get ready to start creating new instructions into the vectorized body. 738 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 739 740 // Register the new loop. 741 Loop* Lp = new Loop(); 742 LPM->insertLoop(Lp, OrigLoop->getParentLoop()); 743 744 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 745 746 Loop *ParentLoop = OrigLoop->getParentLoop(); 747 if (ParentLoop) { 748 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 749 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 750 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 751 } 752 753 // Save the state. 754 LoopVectorPreHeader = VectorPH; 755 LoopScalarPreHeader = ScalarPH; 756 LoopMiddleBlock = MiddleBlock; 757 LoopExitBlock = ExitBlock; 758 LoopVectorBody = VecBody; 759 LoopScalarBody = OldBasicBlock; 760 LoopBypassBlock = BypassBlock; 761 } 762 763 /// This function returns the identity element (or neutral element) for 764 /// the operation K. 765 static unsigned 766 getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { 767 switch (K) { 768 case LoopVectorizationLegality::IntegerXor: 769 case LoopVectorizationLegality::IntegerAdd: 770 case LoopVectorizationLegality::IntegerOr: 771 // Adding, Xoring, Oring zero to a number does not change it. 772 return 0; 773 case LoopVectorizationLegality::IntegerMult: 774 // Multiplying a number by 1 does not change it. 775 return 1; 776 case LoopVectorizationLegality::IntegerAnd: 777 // AND-ing a number with an all-1 value does not change it. 778 return -1; 779 default: 780 llvm_unreachable("Unknown reduction kind"); 781 } 782 } 783 784 void 785 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { 786 //===------------------------------------------------===// 787 // 788 // Notice: any optimization or new instruction that go 789 // into the code below should be also be implemented in 790 // the cost-model. 791 // 792 //===------------------------------------------------===// 793 typedef SmallVector<PHINode*, 4> PhiVector; 794 BasicBlock &BB = *OrigLoop->getHeader(); 795 Constant *Zero = ConstantInt::get( 796 IntegerType::getInt32Ty(BB.getContext()), 0); 797 798 // In order to support reduction variables we need to be able to vectorize 799 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two 800 // steages. First, we create a new vector PHI node with no incoming edges. 801 // We use this value when we vectorize all of the instructions that use the 802 // PHI. Next, after all of the instructions in the block are complete we 803 // add the new incoming edges to the PHI. At this point all of the 804 // instructions in the basic block are vectorized, so we can use them to 805 // construct the PHI. 806 PhiVector PHIsToFix; 807 808 // For each instruction in the old loop. 809 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 810 Instruction *Inst = it; 811 812 switch (Inst->getOpcode()) { 813 case Instruction::Br: 814 // Nothing to do for PHIs and BR, since we already took care of the 815 // loop control flow instructions. 816 continue; 817 case Instruction::PHI:{ 818 PHINode* P = cast<PHINode>(Inst); 819 // Special handling for the induction var. 820 if (OldInduction == Inst) 821 continue; 822 // This is phase one of vectorizing PHIs. 823 // This has to be a reduction variable. 824 assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); 825 Type *VecTy = VectorType::get(Inst->getType(), VF); 826 WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); 827 PHIsToFix.push_back(P); 828 continue; 829 } 830 case Instruction::Add: 831 case Instruction::FAdd: 832 case Instruction::Sub: 833 case Instruction::FSub: 834 case Instruction::Mul: 835 case Instruction::FMul: 836 case Instruction::UDiv: 837 case Instruction::SDiv: 838 case Instruction::FDiv: 839 case Instruction::URem: 840 case Instruction::SRem: 841 case Instruction::FRem: 842 case Instruction::Shl: 843 case Instruction::LShr: 844 case Instruction::AShr: 845 case Instruction::And: 846 case Instruction::Or: 847 case Instruction::Xor: { 848 // Just widen binops. 849 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 850 Value *A = getVectorValue(Inst->getOperand(0)); 851 Value *B = getVectorValue(Inst->getOperand(1)); 852 853 // Use this vector value for all users of the original instruction. 854 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 855 WidenMap[Inst] = V; 856 857 // Update the NSW, NUW and Exact flags. 858 BinaryOperator *VecOp = cast<BinaryOperator>(V); 859 if (isa<OverflowingBinaryOperator>(BinOp)) { 860 VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); 861 VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); 862 } 863 if (isa<PossiblyExactOperator>(VecOp)) 864 VecOp->setIsExact(BinOp->isExact()); 865 break; 866 } 867 case Instruction::Select: { 868 // Widen selects. 869 // If the selector is loop invariant we can create a select 870 // instruction with a scalar condition. Otherwise, use vector-select. 871 Value *Cond = Inst->getOperand(0); 872 bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); 873 874 // The condition can be loop invariant but still defined inside the 875 // loop. This means that we can't just use the original 'cond' value. 876 // We have to take the 'vectorized' value and pick the first lane. 877 // Instcombine will make this a no-op. 878 Cond = getVectorValue(Cond); 879 if (InvariantCond) 880 Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); 881 882 Value *Op0 = getVectorValue(Inst->getOperand(1)); 883 Value *Op1 = getVectorValue(Inst->getOperand(2)); 884 WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); 885 break; 886 } 887 888 case Instruction::ICmp: 889 case Instruction::FCmp: { 890 // Widen compares. Generate vector compares. 891 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 892 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 893 Value *A = getVectorValue(Inst->getOperand(0)); 894 Value *B = getVectorValue(Inst->getOperand(1)); 895 if (FCmp) 896 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 897 else 898 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 899 break; 900 } 901 902 case Instruction::Store: { 903 // Attempt to issue a wide store. 904 StoreInst *SI = dyn_cast<StoreInst>(Inst); 905 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 906 Value *Ptr = SI->getPointerOperand(); 907 unsigned Alignment = SI->getAlignment(); 908 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 909 // This store does not use GEPs. 910 if (!Legal->isConsecutiveGep(Gep)) { 911 scalarizeInstruction(Inst); 912 break; 913 } 914 915 // The last index does not have to be the induction. It can be 916 // consecutive and be a function of the index. For example A[I+1]; 917 unsigned NumOperands = Gep->getNumOperands(); 918 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); 919 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 920 921 // Create the new GEP with the new induction variable. 922 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 923 Gep2->setOperand(NumOperands - 1, LastIndex); 924 Ptr = Builder.Insert(Gep2); 925 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 926 Value *Val = getVectorValue(SI->getValueOperand()); 927 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 928 break; 929 } 930 case Instruction::Load: { 931 // Attempt to issue a wide load. 932 LoadInst *LI = dyn_cast<LoadInst>(Inst); 933 Type *RetTy = VectorType::get(LI->getType(), VF); 934 Value *Ptr = LI->getPointerOperand(); 935 unsigned Alignment = LI->getAlignment(); 936 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 937 938 // We don't have a gep. Scalarize the load. 939 if (!Legal->isConsecutiveGep(Gep)) { 940 scalarizeInstruction(Inst); 941 break; 942 } 943 944 // The last index does not have to be the induction. It can be 945 // consecutive and be a function of the index. For example A[I+1]; 946 unsigned NumOperands = Gep->getNumOperands(); 947 Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); 948 LastIndex = Builder.CreateExtractElement(LastIndex, Zero); 949 950 // Create the new GEP with the new induction variable. 951 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 952 Gep2->setOperand(NumOperands - 1, LastIndex); 953 Ptr = Builder.Insert(Gep2); 954 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 955 LI = Builder.CreateLoad(Ptr); 956 LI->setAlignment(Alignment); 957 // Use this vector value for all users of the load. 958 WidenMap[Inst] = LI; 959 break; 960 } 961 case Instruction::ZExt: 962 case Instruction::SExt: 963 case Instruction::FPToUI: 964 case Instruction::FPToSI: 965 case Instruction::FPExt: 966 case Instruction::PtrToInt: 967 case Instruction::IntToPtr: 968 case Instruction::SIToFP: 969 case Instruction::UIToFP: 970 case Instruction::Trunc: 971 case Instruction::FPTrunc: 972 case Instruction::BitCast: { 973 /// Vectorize bitcasts. 974 CastInst *CI = dyn_cast<CastInst>(Inst); 975 Value *A = getVectorValue(Inst->getOperand(0)); 976 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 977 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 978 break; 979 } 980 981 default: 982 /// All other instructions are unsupported. Scalarize them. 983 scalarizeInstruction(Inst); 984 break; 985 }// end of switch. 986 }// end of for_each instr. 987 988 // At this point every instruction in the original loop is widended to 989 // a vector form. We are almost done. Now, we need to fix the PHI nodes 990 // that we vectorized. The PHI nodes are currently empty because we did 991 // not want to introduce cycles. Notice that the remaining PHI nodes 992 // that we need to fix are reduction variables. 993 994 // Create the 'reduced' values for each of the induction vars. 995 // The reduced values are the vector values that we scalarize and combine 996 // after the loop is finished. 997 for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); 998 it != e; ++it) { 999 PHINode *RdxPhi = *it; 1000 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); 1001 assert(RdxPhi && "Unable to recover vectorized PHI"); 1002 1003 // Find the reduction variable descriptor. 1004 assert(Legal->getReductionVars()->count(RdxPhi) && 1005 "Unable to find the reduction variable"); 1006 LoopVectorizationLegality::ReductionDescriptor RdxDesc = 1007 (*Legal->getReductionVars())[RdxPhi]; 1008 1009 // We need to generate a reduction vector from the incoming scalar. 1010 // To do so, we need to generate the 'identity' vector and overide 1011 // one of the elements with the incoming scalar reduction. We need 1012 // to do it in the vector-loop preheader. 1013 Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); 1014 1015 // This is the vector-clone of the value that leaves the loop. 1016 Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); 1017 Type *VecTy = VectorExit->getType(); 1018 1019 // Find the reduction identity variable. Zero for addition, or, xor, 1020 // one for multiplication, -1 for And. 1021 Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), 1022 VecTy->getScalarType()); 1023 1024 // This vector is the Identity vector where the first element is the 1025 // incoming scalar reduction. 1026 Value *VectorStart = Builder.CreateInsertElement(Identity, 1027 RdxDesc.StartValue, Zero); 1028 1029 1030 // Fix the vector-loop phi. 1031 // We created the induction variable so we know that the 1032 // preheader is the first entry. 1033 BasicBlock *VecPreheader = Induction->getIncomingBlock(0); 1034 1035 // Reductions do not have to start at zero. They can start with 1036 // any loop invariant values. 1037 VecRdxPhi->addIncoming(VectorStart, VecPreheader); 1038 unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1039 Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); 1040 VecRdxPhi->addIncoming(Val, LoopVectorBody); 1041 1042 // Before each round, move the insertion point right between 1043 // the PHIs and the values we are going to write. 1044 // This allows us to write both PHINodes and the extractelement 1045 // instructions. 1046 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); 1047 1048 // This PHINode contains the vectorized reduction variable, or 1049 // the initial value vector, if we bypass the vector loop. 1050 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); 1051 NewPhi->addIncoming(VectorStart, LoopBypassBlock); 1052 NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); 1053 1054 // Extract the first scalar. 1055 Value *Scalar0 = 1056 Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); 1057 // Extract and reduce the remaining vector elements. 1058 for (unsigned i=1; i < VF; ++i) { 1059 Value *Scalar1 = 1060 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); 1061 switch (RdxDesc.Kind) { 1062 case LoopVectorizationLegality::IntegerAdd: 1063 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); 1064 break; 1065 case LoopVectorizationLegality::IntegerMult: 1066 Scalar0 = Builder.CreateMul(Scalar0, Scalar1); 1067 break; 1068 case LoopVectorizationLegality::IntegerOr: 1069 Scalar0 = Builder.CreateOr(Scalar0, Scalar1); 1070 break; 1071 case LoopVectorizationLegality::IntegerAnd: 1072 Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); 1073 break; 1074 case LoopVectorizationLegality::IntegerXor: 1075 Scalar0 = Builder.CreateXor(Scalar0, Scalar1); 1076 break; 1077 default: 1078 llvm_unreachable("Unknown reduction operation"); 1079 } 1080 } 1081 1082 // Now, we need to fix the users of the reduction variable 1083 // inside and outside of the scalar remainder loop. 1084 // We know that the loop is in LCSSA form. We need to update the 1085 // PHI nodes in the exit blocks. 1086 for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 1087 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 1088 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 1089 if (!LCSSAPhi) continue; 1090 1091 // All PHINodes need to have a single entry edge, or two if 1092 // we already fixed them. 1093 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 1094 1095 // We found our reduction value exit-PHI. Update it with the 1096 // incoming bypass edge. 1097 if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { 1098 // Add an edge coming from the bypass. 1099 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); 1100 break; 1101 } 1102 }// end of the LCSSA phi scan. 1103 1104 // Fix the scalar loop reduction variable with the incoming reduction sum 1105 // from the vector body and from the backedge value. 1106 int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); 1107 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. 1108 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); 1109 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); 1110 }// end of for each redux variable. 1111 } 1112 1113 void SingleBlockLoopVectorizer::updateAnalysis() { 1114 // The original basic block. 1115 SE->forgetLoop(OrigLoop); 1116 1117 // Update the dominator tree information. 1118 assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && 1119 "Entry does not dominate exit."); 1120 1121 DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); 1122 DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); 1123 DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); 1124 DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); 1125 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 1126 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 1127 1128 DEBUG(DT->verifyAnalysis()); 1129 } 1130 1131 bool LoopVectorizationLegality::canVectorize() { 1132 if (!TheLoop->getLoopPreheader()) { 1133 assert(false && "No preheader!!"); 1134 DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); 1135 return false; 1136 } 1137 1138 // We can only vectorize single basic block loops. 1139 unsigned NumBlocks = TheLoop->getNumBlocks(); 1140 if (NumBlocks != 1) { 1141 DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); 1142 return false; 1143 } 1144 1145 // We need to have a loop header. 1146 BasicBlock *BB = TheLoop->getHeader(); 1147 DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); 1148 1149 // Go over each instruction and look at memory deps. 1150 if (!canVectorizeBlock(*BB)) { 1151 DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); 1152 return false; 1153 } 1154 1155 // ScalarEvolution needs to be able to find the exit count. 1156 const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); 1157 if (ExitCount == SE->getCouldNotCompute()) { 1158 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 1159 return false; 1160 } 1161 1162 // Do not loop-vectorize loops with a tiny trip count. 1163 unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); 1164 if (TC > 0u && TC < TinyTripCountThreshold) { 1165 DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << 1166 "This loop is not worth vectorizing.\n"); 1167 return false; 1168 } 1169 1170 DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); 1171 1172 // Okay! We can vectorize. At this point we don't have any other mem analysis 1173 // which may limit our maximum vectorization factor, so just return true with 1174 // no restrictions. 1175 return true; 1176 } 1177 1178 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { 1179 // Scan the instructions in the block and look for hazards. 1180 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1181 Instruction *I = it; 1182 1183 PHINode *Phi = dyn_cast<PHINode>(I); 1184 if (Phi) { 1185 // This should not happen because the loop should be normalized. 1186 if (Phi->getNumIncomingValues() != 2) { 1187 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 1188 return false; 1189 } 1190 // We only look at integer phi nodes. 1191 if (!Phi->getType()->isIntegerTy()) { 1192 DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); 1193 return false; 1194 } 1195 1196 if (isInductionVariable(Phi)) { 1197 if (Induction) { 1198 DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); 1199 return false; 1200 } 1201 DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); 1202 Induction = Phi; 1203 continue; 1204 } 1205 if (AddReductionVar(Phi, IntegerAdd)) { 1206 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); 1207 continue; 1208 } 1209 if (AddReductionVar(Phi, IntegerMult)) { 1210 DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); 1211 continue; 1212 } 1213 if (AddReductionVar(Phi, IntegerOr)) { 1214 DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); 1215 continue; 1216 } 1217 if (AddReductionVar(Phi, IntegerAnd)) { 1218 DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); 1219 continue; 1220 } 1221 if (AddReductionVar(Phi, IntegerXor)) { 1222 DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); 1223 continue; 1224 } 1225 1226 DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); 1227 return false; 1228 }// end of PHI handling 1229 1230 // We still don't handle functions. 1231 CallInst *CI = dyn_cast<CallInst>(I); 1232 if (CI) { 1233 DEBUG(dbgs() << "LV: Found a call site.\n"); 1234 return false; 1235 } 1236 1237 // We do not re-vectorize vectors. 1238 if (!VectorType::isValidElementType(I->getType()) && 1239 !I->getType()->isVoidTy()) { 1240 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 1241 return false; 1242 } 1243 1244 // Reduction instructions are allowed to have exit users. 1245 // All other instructions must not have external users. 1246 if (!AllowedExit.count(I)) 1247 //Check that all of the users of the loop are inside the BB. 1248 for (Value::use_iterator it = I->use_begin(), e = I->use_end(); 1249 it != e; ++it) { 1250 Instruction *U = cast<Instruction>(*it); 1251 // This user may be a reduction exit value. 1252 BasicBlock *Parent = U->getParent(); 1253 if (Parent != &BB) { 1254 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 1255 return false; 1256 } 1257 } 1258 } // next instr. 1259 1260 if (!Induction) { 1261 DEBUG(dbgs() << "LV: Did not find an induction var.\n"); 1262 return false; 1263 } 1264 1265 // Don't vectorize if the memory dependencies do not allow vectorization. 1266 if (!canVectorizeMemory(BB)) 1267 return false; 1268 1269 // We now know that the loop is vectorizable! 1270 // Collect variables that will remain uniform after vectorization. 1271 std::vector<Value*> Worklist; 1272 1273 // Start with the conditional branch and walk up the block. 1274 Worklist.push_back(BB.getTerminator()->getOperand(0)); 1275 1276 while (Worklist.size()) { 1277 Instruction *I = dyn_cast<Instruction>(Worklist.back()); 1278 Worklist.pop_back(); 1279 // Look at instructions inside this block. 1280 if (!I) continue; 1281 if (I->getParent() != &BB) continue; 1282 1283 // Stop when reaching PHI nodes. 1284 if (isa<PHINode>(I)) { 1285 assert(I == Induction && "Found a uniform PHI that is not the induction"); 1286 break; 1287 } 1288 1289 // This is a known uniform. 1290 Uniforms.insert(I); 1291 1292 // Insert all operands. 1293 for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { 1294 Worklist.push_back(I->getOperand(i)); 1295 } 1296 } 1297 1298 return true; 1299 } 1300 1301 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { 1302 typedef SmallVector<Value*, 16> ValueVector; 1303 typedef SmallPtrSet<Value*, 16> ValueSet; 1304 // Holds the Load and Store *instructions*. 1305 ValueVector Loads; 1306 ValueVector Stores; 1307 1308 // Scan the BB and collect legal loads and stores. 1309 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 1310 Instruction *I = it; 1311 1312 // If this is a load, save it. If this instruction can read from memory 1313 // but is not a load, then we quit. Notice that we don't handle function 1314 // calls that read or write. 1315 if (I->mayReadFromMemory()) { 1316 LoadInst *Ld = dyn_cast<LoadInst>(I); 1317 if (!Ld) return false; 1318 if (!Ld->isSimple()) { 1319 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 1320 return false; 1321 } 1322 Loads.push_back(Ld); 1323 continue; 1324 } 1325 1326 // Save store instructions. Abort if other instructions write to memory. 1327 if (I->mayWriteToMemory()) { 1328 StoreInst *St = dyn_cast<StoreInst>(I); 1329 if (!St) return false; 1330 if (!St->isSimple()) { 1331 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 1332 return false; 1333 } 1334 Stores.push_back(St); 1335 } 1336 } // next instr. 1337 1338 // Now we have two lists that hold the loads and the stores. 1339 // Next, we find the pointers that they use. 1340 1341 // Check if we see any stores. If there are no stores, then we don't 1342 // care if the pointers are *restrict*. 1343 if (!Stores.size()) { 1344 DEBUG(dbgs() << "LV: Found a read-only loop!\n"); 1345 return true; 1346 } 1347 1348 // Holds the read and read-write *pointers* that we find. 1349 ValueVector Reads; 1350 ValueVector ReadWrites; 1351 1352 // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects 1353 // multiple times on the same object. If the ptr is accessed twice, once 1354 // for read and once for write, it will only appear once (on the write 1355 // list). This is okay, since we are going to check for conflicts between 1356 // writes and between reads and writes, but not between reads and reads. 1357 ValueSet Seen; 1358 1359 ValueVector::iterator I, IE; 1360 for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { 1361 StoreInst *ST = dyn_cast<StoreInst>(*I); 1362 assert(ST && "Bad StoreInst"); 1363 Value* Ptr = ST->getPointerOperand(); 1364 // If we did *not* see this pointer before, insert it to 1365 // the read-write list. At this phase it is only a 'write' list. 1366 if (Seen.insert(Ptr)) 1367 ReadWrites.push_back(Ptr); 1368 } 1369 1370 for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { 1371 LoadInst *LD = dyn_cast<LoadInst>(*I); 1372 assert(LD && "Bad LoadInst"); 1373 Value* Ptr = LD->getPointerOperand(); 1374 // If we did *not* see this pointer before, insert it to the 1375 // read list. If we *did* see it before, then it is already in 1376 // the read-write list. This allows us to vectorize expressions 1377 // such as A[i] += x; Because the address of A[i] is a read-write 1378 // pointer. This only works if the index of A[i] is consecutive. 1379 // If the address of i is unknown (for example A[B[i]]) then we may 1380 // read a few words, modify, and write a few words, and some of the 1381 // words may be written to the same address. 1382 if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr)) 1383 Reads.push_back(Ptr); 1384 } 1385 1386 // If we write (or read-write) to a single destination and there are no 1387 // other reads in this loop then is it safe to vectorize. 1388 if (ReadWrites.size() == 1 && Reads.size() == 0) { 1389 DEBUG(dbgs() << "LV: Found a write-only loop!\n"); 1390 return true; 1391 } 1392 1393 // Now that the pointers are in two lists (Reads and ReadWrites), we 1394 // can check that there are no conflicts between each of the writes and 1395 // between the writes to the reads. 1396 ValueSet WriteObjects; 1397 ValueVector TempObjects; 1398 1399 // Check that the read-writes do not conflict with other read-write 1400 // pointers. 1401 for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { 1402 GetUnderlyingObjects(*I, TempObjects, DL); 1403 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1404 it != e; ++it) { 1405 if (!isIdentifiedObject(*it)) { 1406 DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); 1407 return false; 1408 } 1409 if (!WriteObjects.insert(*it)) { 1410 DEBUG(dbgs() << "LV: Found a possible write-write reorder:" 1411 << **it <<"\n"); 1412 return false; 1413 } 1414 } 1415 TempObjects.clear(); 1416 } 1417 1418 /// Check that the reads don't conflict with the read-writes. 1419 for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { 1420 GetUnderlyingObjects(*I, TempObjects, DL); 1421 for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); 1422 it != e; ++it) { 1423 if (!isIdentifiedObject(*it)) { 1424 DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); 1425 return false; 1426 } 1427 if (WriteObjects.count(*it)) { 1428 DEBUG(dbgs() << "LV: Found a possible read/write reorder:" 1429 << **it <<"\n"); 1430 return false; 1431 } 1432 } 1433 TempObjects.clear(); 1434 } 1435 1436 // All is okay. 1437 return true; 1438 } 1439 1440 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, 1441 ReductionKind Kind) { 1442 if (Phi->getNumIncomingValues() != 2) 1443 return false; 1444 1445 // Find the possible incoming reduction variable. 1446 BasicBlock *BB = Phi->getParent(); 1447 int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); 1448 int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. 1449 Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); 1450 1451 // ExitInstruction is the single value which is used outside the loop. 1452 // We only allow for a single reduction value to be used outside the loop. 1453 // This includes users of the reduction, variables (which form a cycle 1454 // which ends in the phi node). 1455 Instruction *ExitInstruction = 0; 1456 1457 // Iter is our iterator. We start with the PHI node and scan for all of the 1458 // users of this instruction. All users must be instructions which can be 1459 // used as reduction variables (such as ADD). We may have a single 1460 // out-of-block user. They cycle must end with the original PHI. 1461 // Also, we can't have multiple block-local users. 1462 Instruction *Iter = Phi; 1463 while (true) { 1464 // Any reduction instr must be of one of the allowed kinds. 1465 if (!isReductionInstr(Iter, Kind)) 1466 return false; 1467 1468 // Did we found a user inside this block ? 1469 bool FoundInBlockUser = false; 1470 // Did we reach the initial PHI node ? 1471 bool FoundStartPHI = false; 1472 1473 // If the instruction has no users then this is a broken 1474 // chain and can't be a reduction variable. 1475 if (Iter->use_empty()) 1476 return false; 1477 1478 // For each of the *users* of iter. 1479 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); 1480 it != e; ++it) { 1481 Instruction *U = cast<Instruction>(*it); 1482 // We already know that the PHI is a user. 1483 if (U == Phi) { 1484 FoundStartPHI = true; 1485 continue; 1486 } 1487 // Check if we found the exit user. 1488 BasicBlock *Parent = U->getParent(); 1489 if (Parent != BB) { 1490 // We must have a single exit instruction. 1491 if (ExitInstruction != 0) 1492 return false; 1493 ExitInstruction = Iter; 1494 } 1495 // We can't have multiple inside users. 1496 if (FoundInBlockUser) 1497 return false; 1498 FoundInBlockUser = true; 1499 Iter = U; 1500 } 1501 1502 // We found a reduction var if we have reached the original 1503 // phi node and we only have a single instruction with out-of-loop 1504 // users. 1505 if (FoundStartPHI && ExitInstruction) { 1506 // This instruction is allowed to have out-of-loop users. 1507 AllowedExit.insert(ExitInstruction); 1508 1509 // Save the description of this reduction variable. 1510 ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); 1511 Reductions[Phi] = RD; 1512 return true; 1513 } 1514 } 1515 } 1516 1517 bool 1518 LoopVectorizationLegality::isReductionInstr(Instruction *I, 1519 ReductionKind Kind) { 1520 switch (I->getOpcode()) { 1521 default: 1522 return false; 1523 case Instruction::PHI: 1524 // possibly. 1525 return true; 1526 case Instruction::Add: 1527 case Instruction::Sub: 1528 return Kind == IntegerAdd; 1529 case Instruction::Mul: 1530 case Instruction::UDiv: 1531 case Instruction::SDiv: 1532 return Kind == IntegerMult; 1533 case Instruction::And: 1534 return Kind == IntegerAnd; 1535 case Instruction::Or: 1536 return Kind == IntegerOr; 1537 case Instruction::Xor: 1538 return Kind == IntegerXor; 1539 } 1540 } 1541 1542 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { 1543 // Check that the PHI is consecutive and starts at zero. 1544 const SCEV *PhiScev = SE->getSCEV(Phi); 1545 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 1546 if (!AR) { 1547 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 1548 return false; 1549 } 1550 const SCEV *Step = AR->getStepRecurrence(*SE); 1551 1552 if (!Step->isOne()) { 1553 DEBUG(dbgs() << "LV: PHI stride does not equal one.\n"); 1554 return false; 1555 } 1556 return true; 1557 } 1558 1559 unsigned 1560 LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { 1561 if (!VTTI) { 1562 DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); 1563 return 1; 1564 } 1565 1566 float Cost = expectedCost(1); 1567 unsigned Width = 1; 1568 DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); 1569 for (unsigned i=2; i <= VF; i*=2) { 1570 // Notice that the vector loop needs to be executed less times, so 1571 // we need to divide the cost of the vector loops by the width of 1572 // the vector elements. 1573 float VectorCost = expectedCost(i) / (float)i; 1574 DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << 1575 (int)VectorCost << ".\n"); 1576 if (VectorCost < Cost) { 1577 Cost = VectorCost; 1578 Width = i; 1579 } 1580 } 1581 1582 DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); 1583 return Width; 1584 } 1585 1586 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { 1587 // We can only estimate the cost of single basic block loops. 1588 assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); 1589 1590 BasicBlock *BB = TheLoop->getHeader(); 1591 unsigned Cost = 0; 1592 1593 // For each instruction in the old loop. 1594 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 1595 Instruction *Inst = it; 1596 unsigned C = getInstructionCost(Inst, VF); 1597 Cost += C; 1598 DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << 1599 " For instruction: "<< *Inst << "\n"); 1600 } 1601 1602 return Cost; 1603 } 1604 1605 unsigned 1606 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 1607 assert(VTTI && "Invalid vector target transformation info"); 1608 1609 // If we know that this instruction will remain uniform, check the cost of 1610 // the scalar version. 1611 if (Legal->isUniformAfterVectorization(I)) 1612 VF = 1; 1613 1614 Type *RetTy = I->getType(); 1615 Type *VectorTy = ToVectorTy(RetTy, VF); 1616 1617 1618 // TODO: We need to estimate the cost of intrinsic calls. 1619 switch (I->getOpcode()) { 1620 case Instruction::GetElementPtr: 1621 // We mark this instruction as zero-cost because scalar GEPs are usually 1622 // lowered to the intruction addressing mode. At the moment we don't 1623 // generate vector geps. 1624 return 0; 1625 case Instruction::Br: { 1626 return VTTI->getCFInstrCost(I->getOpcode()); 1627 } 1628 case Instruction::PHI: 1629 return 0; 1630 case Instruction::Add: 1631 case Instruction::FAdd: 1632 case Instruction::Sub: 1633 case Instruction::FSub: 1634 case Instruction::Mul: 1635 case Instruction::FMul: 1636 case Instruction::UDiv: 1637 case Instruction::SDiv: 1638 case Instruction::FDiv: 1639 case Instruction::URem: 1640 case Instruction::SRem: 1641 case Instruction::FRem: 1642 case Instruction::Shl: 1643 case Instruction::LShr: 1644 case Instruction::AShr: 1645 case Instruction::And: 1646 case Instruction::Or: 1647 case Instruction::Xor: { 1648 return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); 1649 } 1650 case Instruction::Select: { 1651 SelectInst *SI = cast<SelectInst>(I); 1652 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 1653 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 1654 Type *CondTy = SI->getCondition()->getType(); 1655 if (ScalarCond) 1656 CondTy = VectorType::get(CondTy, VF); 1657 1658 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 1659 } 1660 case Instruction::ICmp: 1661 case Instruction::FCmp: { 1662 Type *ValTy = I->getOperand(0)->getType(); 1663 VectorTy = ToVectorTy(ValTy, VF); 1664 return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); 1665 } 1666 case Instruction::Store: { 1667 StoreInst *SI = cast<StoreInst>(I); 1668 Type *ValTy = SI->getValueOperand()->getType(); 1669 VectorTy = ToVectorTy(ValTy, VF); 1670 1671 if (VF == 1) 1672 return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, 1673 SI->getAlignment(), SI->getPointerAddressSpace()); 1674 1675 // Scalarized stores. 1676 if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { 1677 unsigned Cost = 0; 1678 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 1679 ValTy); 1680 // The cost of extracting from the value vector. 1681 Cost += VF * (ExtCost); 1682 // The cost of the scalar stores. 1683 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 1684 ValTy->getScalarType(), 1685 SI->getAlignment(), 1686 SI->getPointerAddressSpace()); 1687 return Cost; 1688 } 1689 1690 // Wide stores. 1691 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), 1692 SI->getPointerAddressSpace()); 1693 } 1694 case Instruction::Load: { 1695 LoadInst *LI = cast<LoadInst>(I); 1696 1697 if (VF == 1) 1698 return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, 1699 LI->getAlignment(), 1700 LI->getPointerAddressSpace()); 1701 1702 // Scalarized loads. 1703 if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { 1704 unsigned Cost = 0; 1705 unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); 1706 // The cost of inserting the loaded value into the result vector. 1707 Cost += VF * (InCost); 1708 // The cost of the scalar stores. 1709 Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), 1710 RetTy->getScalarType(), 1711 LI->getAlignment(), 1712 LI->getPointerAddressSpace()); 1713 return Cost; 1714 } 1715 1716 // Wide loads. 1717 return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), 1718 LI->getPointerAddressSpace()); 1719 } 1720 case Instruction::ZExt: 1721 case Instruction::SExt: 1722 case Instruction::FPToUI: 1723 case Instruction::FPToSI: 1724 case Instruction::FPExt: 1725 case Instruction::PtrToInt: 1726 case Instruction::IntToPtr: 1727 case Instruction::SIToFP: 1728 case Instruction::UIToFP: 1729 case Instruction::Trunc: 1730 case Instruction::FPTrunc: 1731 case Instruction::BitCast: { 1732 Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 1733 return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 1734 } 1735 default: { 1736 // We are scalarizing the instruction. Return the cost of the scalar 1737 // instruction, plus the cost of insert and extract into vector 1738 // elements, times the vector width. 1739 unsigned Cost = 0; 1740 1741 bool IsVoid = RetTy->isVoidTy(); 1742 1743 unsigned InsCost = (IsVoid ? 0 : 1744 VTTI->getInstrCost(Instruction::InsertElement, 1745 VectorTy)); 1746 1747 unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, 1748 VectorTy); 1749 1750 // The cost of inserting the results plus extracting each one of the 1751 // operands. 1752 Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 1753 1754 // The cost of executing VF copies of the scalar instruction. 1755 Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); 1756 return Cost; 1757 } 1758 }// end of switch. 1759 } 1760 1761 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 1762 if (Scalar->isVoidTy() || VF == 1) 1763 return Scalar; 1764 return VectorType::get(Scalar, VF); 1765 } 1766 1767 } // namespace 1768 1769 char LoopVectorize::ID = 0; 1770 static const char lv_name[] = "Loop Vectorization"; 1771 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 1772 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1773 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1774 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1775 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 1776 1777 namespace llvm { 1778 Pass *createLoopVectorizePass() { 1779 return new LoopVectorize(); 1780 } 1781 } 1782 1783