1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is a simple loop vectorizer. We currently only support single block 11 // loops. We have a very simple and restrictive legality check: we need to read 12 // and write from disjoint memory locations. We still don't have a cost model. 13 // This pass has three parts: 14 // 1. The main loop pass that drives the different parts. 15 // 2. LoopVectorizationLegality - A helper class that checks for the legality 16 // of the vectorization. 17 // 3. SingleBlockLoopVectorizer - A helper class that performs the actual 18 // widening of instructions. 19 // 20 //===----------------------------------------------------------------------===// 21 #define LV_NAME "loop-vectorize" 22 #define DEBUG_TYPE LV_NAME 23 #include "llvm/Constants.h" 24 #include "llvm/DerivedTypes.h" 25 #include "llvm/Instructions.h" 26 #include "llvm/LLVMContext.h" 27 #include "llvm/Pass.h" 28 #include "llvm/Analysis/LoopPass.h" 29 #include "llvm/Value.h" 30 #include "llvm/Function.h" 31 #include "llvm/Analysis/Verifier.h" 32 #include "llvm/Module.h" 33 #include "llvm/Type.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/StringExtras.h" 36 #include "llvm/Analysis/AliasAnalysis.h" 37 #include "llvm/Analysis/AliasSetTracker.h" 38 #include "llvm/Transforms/Scalar.h" 39 #include "llvm/Analysis/ScalarEvolution.h" 40 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 41 #include "llvm/Analysis/ScalarEvolutionExpander.h" 42 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 43 #include "llvm/Analysis/ValueTracking.h" 44 #include "llvm/Analysis/LoopInfo.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/Debug.h" 47 #include "llvm/Support/raw_ostream.h" 48 #include "llvm/DataLayout.h" 49 #include "llvm/Transforms/Utils/Local.h" 50 #include <algorithm> 51 using namespace llvm; 52 53 static cl::opt<unsigned> 54 DefaultVectorizationFactor("default-loop-vectorize-width", 55 cl::init(4), cl::Hidden, 56 cl::desc("Set the default loop vectorization width")); 57 58 namespace { 59 60 /// Vectorize a simple loop. This class performs the widening of simple single 61 /// basic block loops into vectors. It does not perform any 62 /// vectorization-legality checks, and just does it. It widens the vectors 63 /// to a given vectorization factor (VF). 64 class SingleBlockLoopVectorizer { 65 public: 66 /// Ctor. 67 SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, 68 LPPassManager *Lpm, unsigned VecWidth): 69 Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), 70 Builder(Se->getContext()), Induction(0), OldInduction(0) { } 71 72 // Perform the actual loop widening (vectorization). 73 void vectorize() { 74 ///Create a new empty loop. Unlink the old loop and connect the new one. 75 createEmptyLoop(); 76 /// Widen each instruction in the old loop to a new one in the new loop. 77 vectorizeLoop(); 78 // register the new loop. 79 cleanup(); 80 } 81 82 private: 83 /// Create an empty loop, based on the loop ranges of the old loop. 84 void createEmptyLoop(); 85 /// Copy and widen the instructions from the old loop. 86 void vectorizeLoop(); 87 /// Insert the new loop to the loop hierarchy and pass manager. 88 void cleanup(); 89 90 /// This instruction is un-vectorizable. Implement it as a sequence 91 /// of scalars. 92 void scalarizeInstruction(Instruction *Instr); 93 94 /// Create a broadcast instruction. This method generates a broadcast 95 /// instruction (shuffle) for loop invariant values and for the induction 96 /// value. If this is the induction variable then we extend it to N, N+1, ... 97 /// this is needed because each iteration in the loop corresponds to a SIMD 98 /// element. 99 Value *getBroadcastInstrs(Value *V); 100 101 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. 102 /// for each element in the vector. Starting from zero. 103 Value *getConsecutiveVector(Value* Val); 104 105 /// Check that the GEP operands are all uniform except for the last index 106 /// which has to be the induction variable. 107 bool isConsecutiveGep(GetElementPtrInst *Gep); 108 109 /// When we go over instructions in the basic block we rely on previous 110 /// values within the current basic block or on loop invariant values. 111 /// When we widen (vectorize) values we place them in the map. If the values 112 /// are not within the map, they have to be loop invariant, so we simply 113 /// broadcast them into a vector. 114 Value *getVectorValue(Value *V); 115 116 typedef DenseMap<Value*, Value*> ValueMap; 117 118 /// The original loop. 119 Loop *Orig; 120 // Scev analysis to use. 121 ScalarEvolution *SE; 122 // Loop Info. 123 LoopInfo *LI; 124 // Loop Pass Manager; 125 LPPassManager *LPM; 126 // The vectorization factor to use. 127 unsigned VF; 128 129 // The builder that we use 130 IRBuilder<> Builder; 131 132 // --- Vectorization state --- 133 134 /// The new Induction variable which was added to the new block. 135 PHINode *Induction; 136 /// The induction variable of the old basic block. 137 PHINode *OldInduction; 138 // Maps scalars to widened vectors. 139 ValueMap WidenMap; 140 }; 141 142 /// Perform the vectorization legality check. This class does not look at the 143 /// profitability of vectorization, only the legality. At the moment the checks 144 /// are very simple and focus on single basic block loops with a constant 145 /// iteration count and no reductions. 146 class LoopVectorizationLegality { 147 public: 148 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): 149 TheLoop(Lp), SE(Se), DL(Dl) { } 150 151 /// Returns the maximum vectorization factor that we *can* use to vectorize 152 /// this loop. This does not mean that it is profitable to vectorize this 153 /// loop, only that it is legal to do so. This may be a large number. We 154 /// can vectorize to any SIMD width below this number. 155 unsigned getLoopMaxVF(); 156 157 private: 158 /// Check if a single basic block loop is vectorizable. 159 /// At this point we know that this is a loop with a constant trip count 160 /// and we only need to check individual instructions. 161 bool canVectorizeBlock(BasicBlock &BB); 162 163 // Check if a pointer value is known to be disjoint. 164 // Example: Alloca, Global, NoAlias. 165 bool isIdentifiedSafeObject(Value* Val); 166 167 /// The loop that we evaluate. 168 Loop *TheLoop; 169 /// Scev analysis. 170 ScalarEvolution *SE; 171 /// DataLayout analysis. 172 DataLayout *DL; 173 }; 174 175 struct LoopVectorize : public LoopPass { 176 static char ID; // Pass identification, replacement for typeid 177 178 LoopVectorize() : LoopPass(ID) { 179 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 180 } 181 182 ScalarEvolution *SE; 183 DataLayout *DL; 184 LoopInfo *LI; 185 186 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { 187 // Only vectorize innermost loops. 188 if (!L->empty()) 189 return false; 190 191 SE = &getAnalysis<ScalarEvolution>(); 192 DL = getAnalysisIfAvailable<DataLayout>(); 193 LI = &getAnalysis<LoopInfo>(); 194 195 DEBUG(dbgs() << "LV: Checking a loop in \"" << 196 L->getHeader()->getParent()->getName() << "\"\n"); 197 198 // Check if it is legal to vectorize the loop. 199 LoopVectorizationLegality LVL(L, SE, DL); 200 unsigned MaxVF = LVL.getLoopMaxVF(); 201 202 // Check that we can vectorize using the chosen vectorization width. 203 if (MaxVF < DefaultVectorizationFactor) { 204 DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); 205 return false; 206 } 207 208 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); 209 210 // If we decided that is is *legal* to vectorizer the loop. Do it. 211 SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); 212 LB.vectorize(); 213 214 DEBUG(verifyFunction(*L->getHeader()->getParent())); 215 return true; 216 } 217 218 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 219 LoopPass::getAnalysisUsage(AU); 220 AU.addRequiredID(LoopSimplifyID); 221 AU.addRequired<LoopInfo>(); 222 AU.addRequired<ScalarEvolution>(); 223 } 224 225 }; 226 227 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { 228 // Instructions that access the old induction variable 229 // actually want to get the new one. 230 if (V == OldInduction) 231 V = Induction; 232 // Create the types. 233 LLVMContext &C = V->getContext(); 234 Type *VTy = VectorType::get(V->getType(), VF); 235 Type *I32 = IntegerType::getInt32Ty(C); 236 Constant *Zero = ConstantInt::get(I32, 0); 237 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); 238 Value *UndefVal = UndefValue::get(VTy); 239 // Insert the value into a new vector. 240 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); 241 // Broadcast the scalar into all locations in the vector. 242 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, 243 "broadcast"); 244 // We are accessing the induction variable. Make sure to promote the 245 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. 246 if (V == Induction) 247 return getConsecutiveVector(Shuf); 248 return Shuf; 249 } 250 251 Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { 252 assert(Val->getType()->isVectorTy() && "Must be a vector"); 253 assert(Val->getType()->getScalarType()->isIntegerTy() && 254 "Elem must be an integer"); 255 // Create the types. 256 Type *ITy = Val->getType()->getScalarType(); 257 VectorType *Ty = cast<VectorType>(Val->getType()); 258 unsigned VLen = Ty->getNumElements(); 259 SmallVector<Constant*, 8> Indices; 260 261 // Create a vector of consecutive numbers from zero to VF. 262 for (unsigned i = 0; i < VLen; ++i) 263 Indices.push_back(ConstantInt::get(ITy, i)); 264 265 // Add the consecutive indices to the vector value. 266 Constant *Cv = ConstantVector::get(Indices); 267 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 268 return Builder.CreateAdd(Val, Cv, "induction"); 269 } 270 271 272 bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { 273 if (!Gep) 274 return false; 275 276 unsigned NumOperands = Gep->getNumOperands(); 277 Value *LastIndex = Gep->getOperand(NumOperands - 1); 278 279 // Check that all of the gep indices are uniform except for the last. 280 for (unsigned i = 0; i < NumOperands - 1; ++i) 281 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) 282 return false; 283 284 // We can emit wide load/stores only of the last index is the induction 285 // variable. 286 const SCEV *Last = SE->getSCEV(LastIndex); 287 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 288 const SCEV *Step = AR->getStepRecurrence(*SE); 289 290 // The memory is consecutive because the last index is consecutive 291 // and all other indices are loop invariant. 292 if (Step->isOne()) 293 return true; 294 } 295 296 return false; 297 } 298 299 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { 300 // If we saved a vectorized copy of V, use it. 301 ValueMap::iterator it = WidenMap.find(V); 302 if (it != WidenMap.end()) 303 return it->second; 304 305 // Broadcast V and save the value for future uses. 306 Value *B = getBroadcastInstrs(V); 307 WidenMap[V] = B; 308 return B; 309 } 310 311 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { 312 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 313 // Holds vector parameters or scalars, in case of uniform vals. 314 SmallVector<Value*, 8> Params; 315 316 // Find all of the vectorized parameters. 317 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 318 Value *SrcOp = Instr->getOperand(op); 319 320 // If we are accessing the old induction variable, use the new one. 321 if (SrcOp == OldInduction) { 322 Params.push_back(getBroadcastInstrs(Induction)); 323 continue; 324 } 325 326 // Try using previously calculated values. 327 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 328 329 // If the src is an instruction that appeared earlier in the basic block 330 // then it should already be vectorized. 331 if (SrcInst && SrcInst->getParent() == Instr->getParent()) { 332 assert(WidenMap.count(SrcInst) && "Source operand is unavailable"); 333 // The parameter is a vector value from earlier. 334 Params.push_back(WidenMap[SrcInst]); 335 } else { 336 // The parameter is a scalar from outside the loop. Maybe even a constant. 337 Params.push_back(SrcOp); 338 } 339 } 340 341 assert(Params.size() == Instr->getNumOperands() && 342 "Invalid number of operands"); 343 344 // Does this instruction return a value ? 345 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 346 Value *VecResults = 0; 347 348 // If we have a return value, create an empty vector. We place the scalarized 349 // instructions in this vector. 350 if (!IsVoidRetTy) 351 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); 352 353 // For each scalar that we create. 354 for (unsigned i = 0; i < VF; ++i) { 355 Instruction *Cloned = Instr->clone(); 356 if (!IsVoidRetTy) 357 Cloned->setName(Instr->getName() + ".cloned"); 358 // Replace the operands of the cloned instrucions with extracted scalars. 359 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 360 Value *Op = Params[op]; 361 // Param is a vector. Need to extract the right lane. 362 if (Op->getType()->isVectorTy()) 363 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); 364 Cloned->setOperand(op, Op); 365 } 366 367 // Place the cloned scalar in the new loop. 368 Builder.Insert(Cloned); 369 370 // If the original scalar returns a value we need to place it in a vector 371 // so that future users will be able to use it. 372 if (!IsVoidRetTy) 373 VecResults = Builder.CreateInsertElement(VecResults, Cloned, 374 Builder.getInt32(i)); 375 } 376 377 if (!IsVoidRetTy) 378 WidenMap[Instr] = VecResults; 379 } 380 381 void SingleBlockLoopVectorizer::createEmptyLoop() { 382 /* 383 In this function we generate a new loop. The new loop will contain 384 the vectorized instructions while the old loop will continue to run the 385 scalar remainder. 386 387 [ ] <-- vector loop bypass. 388 / | 389 / v 390 | [ ] <-- vector pre header. 391 | | 392 | v 393 | [ ] \ 394 | [ ]_| <-- vector loop. 395 | | 396 \ v 397 >[ ] <--- middle-block. 398 / | 399 / v 400 | [ ] <--- new preheader. 401 | | 402 | v 403 | [ ] \ 404 | [ ]_| <-- old scalar loop to handle remainder. 405 \ | 406 \ v 407 >[ ] <-- exit block. 408 ... 409 */ 410 411 // This is the original scalar-loop preheader. 412 BasicBlock *BypassBlock = Orig->getLoopPreheader(); 413 BasicBlock *ExitBlock = Orig->getExitBlock(); 414 assert(ExitBlock && "Must have an exit block"); 415 416 assert(Orig->getNumBlocks() == 1 && "Invalid loop"); 417 assert(BypassBlock && "Invalid loop structure"); 418 419 BasicBlock *VectorPH = 420 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); 421 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), 422 "vector.body"); 423 424 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), 425 "middle.block"); 426 BasicBlock *ScalarPH = 427 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), 428 "scalar.preheader"); 429 430 // Find the induction variable. 431 BasicBlock *OldBasicBlock = Orig->getHeader(); 432 OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin()); 433 assert(OldInduction && "We must have a single phi node."); 434 Type *IdxTy = OldInduction->getType(); 435 436 // Use this IR builder to create the loop instructions (Phi, Br, Cmp) 437 // inside the loop. 438 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 439 440 // Generate the induction variable. 441 Induction = Builder.CreatePHI(IdxTy, 2, "index"); 442 Constant *Zero = ConstantInt::get(IdxTy, 0); 443 Constant *Step = ConstantInt::get(IdxTy, VF); 444 445 // Find the loop boundaries. 446 const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); 447 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); 448 449 // Get the total trip count from the count by adding 1. 450 ExitCount = SE->getAddExpr(ExitCount, 451 SE->getConstant(ExitCount->getType(), 1)); 452 453 // Expand the trip count and place the new instructions in the preheader. 454 // Notice that the pre-header does not change, only the loop body. 455 SCEVExpander Exp(*SE, "induction"); 456 Instruction *Loc = BypassBlock->getTerminator(); 457 458 // We may need to extend the index in case there is a type mismatch. 459 // We know that the count starts at zero and does not overflow. 460 // We are using Zext because it should be less expensive. 461 if (ExitCount->getType() != Induction->getType()) 462 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); 463 464 // Count holds the overall loop count (N). 465 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); 466 // Now we need to generate the expression for N - (N % VF), which is 467 // the part that the vectorized body will execute. 468 Constant *CIVF = ConstantInt::get(IdxTy, VF); 469 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); 470 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); 471 472 // Now, compare the new count to zero. If it is zero, jump to the scalar part. 473 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, 474 CountRoundDown, ConstantInt::getNullValue(IdxTy), 475 "cmp.zero", Loc); 476 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); 477 // Remove the old terminator. 478 Loc->eraseFromParent(); 479 480 // Add a check in the middle block to see if we have completed 481 // all of the iterations in the first vector loop. 482 // If (N - N%VF) == N, then we *don't* need to run the remainder. 483 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 484 CountRoundDown, "cmp.n", 485 MiddleBlock->getTerminator()); 486 487 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); 488 // Remove the old terminator. 489 MiddleBlock->getTerminator()->eraseFromParent(); 490 491 // Create i+1 and fill the PHINode. 492 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); 493 Induction->addIncoming(Zero, VectorPH); 494 Induction->addIncoming(NextIdx, VecBody); 495 // Create the compare. 496 Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); 497 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); 498 499 // Now we have two terminators. Remove the old one from the block. 500 VecBody->getTerminator()->eraseFromParent(); 501 502 // Fix the scalar body iteration count. 503 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); 504 OldInduction->setIncomingValue(BlockIdx, CountRoundDown); 505 506 // Get ready to start creating new instructions into the vectorized body. 507 Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); 508 509 // Register the new loop. 510 Loop* Lp = new Loop(); 511 LPM->insertLoop(Lp, Orig->getParentLoop()); 512 513 Lp->addBasicBlockToLoop(VecBody, LI->getBase()); 514 515 Loop *ParentLoop = Orig->getParentLoop(); 516 if (ParentLoop) { 517 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); 518 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); 519 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); 520 } 521 } 522 523 void SingleBlockLoopVectorizer::vectorizeLoop() { 524 BasicBlock &BB = *Orig->getHeader(); 525 526 // For each instruction in the old loop. 527 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 528 Instruction *Inst = it; 529 530 switch (Inst->getOpcode()) { 531 case Instruction::PHI: 532 case Instruction::Br: 533 // Nothing to do for PHIs and BR, since we already took care of the 534 // loop control flow instructions. 535 continue; 536 537 case Instruction::Add: 538 case Instruction::FAdd: 539 case Instruction::Sub: 540 case Instruction::FSub: 541 case Instruction::Mul: 542 case Instruction::FMul: 543 case Instruction::UDiv: 544 case Instruction::SDiv: 545 case Instruction::FDiv: 546 case Instruction::URem: 547 case Instruction::SRem: 548 case Instruction::FRem: 549 case Instruction::Shl: 550 case Instruction::LShr: 551 case Instruction::AShr: 552 case Instruction::And: 553 case Instruction::Or: 554 case Instruction::Xor: { 555 // Just widen binops. 556 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); 557 Value *A = getVectorValue(Inst->getOperand(0)); 558 Value *B = getVectorValue(Inst->getOperand(1)); 559 // Use this vector value for all users of the original instruction. 560 WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); 561 break; 562 } 563 case Instruction::Select: { 564 // Widen selects. 565 Value *A = getVectorValue(Inst->getOperand(0)); 566 Value *B = getVectorValue(Inst->getOperand(1)); 567 Value *C = getVectorValue(Inst->getOperand(2)); 568 WidenMap[Inst] = Builder.CreateSelect(A, B, C); 569 break; 570 } 571 572 case Instruction::ICmp: 573 case Instruction::FCmp: { 574 // Widen compares. Generate vector compares. 575 bool FCmp = (Inst->getOpcode() == Instruction::FCmp); 576 CmpInst *Cmp = dyn_cast<CmpInst>(Inst); 577 Value *A = getVectorValue(Inst->getOperand(0)); 578 Value *B = getVectorValue(Inst->getOperand(1)); 579 if (FCmp) 580 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 581 else 582 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); 583 break; 584 } 585 586 case Instruction::Store: { 587 // Attempt to issue a wide store. 588 StoreInst *SI = dyn_cast<StoreInst>(Inst); 589 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); 590 Value *Ptr = SI->getPointerOperand(); 591 unsigned Alignment = SI->getAlignment(); 592 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 593 // This store does not use GEPs. 594 if (!isConsecutiveGep(Gep)) { 595 scalarizeInstruction(Inst); 596 break; 597 } 598 599 // Create the new GEP with the new induction variable. 600 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 601 unsigned NumOperands = Gep->getNumOperands(); 602 Gep2->setOperand(NumOperands - 1, Induction); 603 Ptr = Builder.Insert(Gep2); 604 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); 605 Value *Val = getVectorValue(SI->getValueOperand()); 606 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); 607 break; 608 } 609 case Instruction::Load: { 610 // Attempt to issue a wide load. 611 LoadInst *LI = dyn_cast<LoadInst>(Inst); 612 Type *RetTy = VectorType::get(LI->getType(), VF); 613 Value *Ptr = LI->getPointerOperand(); 614 unsigned Alignment = LI->getAlignment(); 615 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 616 617 // We don't have a gep. Scalarize the load. 618 if (!isConsecutiveGep(Gep)) { 619 scalarizeInstruction(Inst); 620 break; 621 } 622 623 // Create the new GEP with the new induction variable. 624 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 625 unsigned NumOperands = Gep->getNumOperands(); 626 Gep2->setOperand(NumOperands - 1, Induction); 627 Ptr = Builder.Insert(Gep2); 628 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); 629 LI = Builder.CreateLoad(Ptr); 630 LI->setAlignment(Alignment); 631 // Use this vector value for all users of the load. 632 WidenMap[Inst] = LI; 633 break; 634 } 635 case Instruction::ZExt: 636 case Instruction::SExt: 637 case Instruction::FPToUI: 638 case Instruction::FPToSI: 639 case Instruction::FPExt: 640 case Instruction::PtrToInt: 641 case Instruction::IntToPtr: 642 case Instruction::SIToFP: 643 case Instruction::UIToFP: 644 case Instruction::Trunc: 645 case Instruction::FPTrunc: 646 case Instruction::BitCast: { 647 /// Vectorize bitcasts. 648 CastInst *CI = dyn_cast<CastInst>(Inst); 649 Value *A = getVectorValue(Inst->getOperand(0)); 650 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); 651 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); 652 break; 653 } 654 655 default: 656 /// All other instructions are unsupported. Scalarize them. 657 scalarizeInstruction(Inst); 658 break; 659 }// end of switch. 660 }// end of for_each instr. 661 } 662 663 void SingleBlockLoopVectorizer::cleanup() { 664 // The original basic block. 665 SE->forgetLoop(Orig); 666 } 667 668 unsigned LoopVectorizationLegality::getLoopMaxVF() { 669 if (!TheLoop->getLoopPreheader()) { 670 assert(false && "No preheader!!"); 671 DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); 672 return 1; 673 } 674 675 // We can only vectorize single basic block loops. 676 unsigned NumBlocks = TheLoop->getNumBlocks(); 677 if (NumBlocks != 1) { 678 DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); 679 return 1; 680 } 681 682 // We need to have a loop header. 683 BasicBlock *BB = TheLoop->getHeader(); 684 DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); 685 686 // Go over each instruction and look at memory deps. 687 if (!canVectorizeBlock(*BB)) { 688 DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); 689 return 1; 690 } 691 692 // ScalarEvolution needs to be able to find the exit count. 693 const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); 694 if (ExitCount == SE->getCouldNotCompute()) { 695 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 696 return 1; 697 } 698 699 DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); 700 701 // Okay! We can vectorize. At this point we don't have any other mem analysis 702 // which may limit our maximum vectorization factor, so just return the 703 // maximum SIMD size. 704 return DefaultVectorizationFactor; 705 } 706 707 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { 708 // Holds the read and write pointers that we find. 709 typedef SmallVector<Value*, 10> ValueVector; 710 ValueVector Reads; 711 ValueVector Writes; 712 713 SmallPtrSet<Value*, 16> AnalyzedPtrs; 714 unsigned NumPhis = 0; 715 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { 716 Instruction *I = it; 717 718 PHINode *Phi = dyn_cast<PHINode>(I); 719 if (Phi) { 720 NumPhis++; 721 // We only look at integer phi nodes. 722 if (!Phi->getType()->isIntegerTy()) { 723 DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); 724 return false; 725 } 726 727 // If we found an induction variable. 728 if (NumPhis > 1) { 729 DEBUG(dbgs() << "LV: Found more than one PHI.\n"); 730 return false; 731 } 732 733 // This should not happen because the loop should be normalized. 734 if (Phi->getNumIncomingValues() != 2) { 735 DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); 736 return false; 737 } 738 739 // Check that the PHI is consecutive and starts at zero. 740 const SCEV *PhiScev = SE->getSCEV(Phi); 741 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); 742 if (!AR) { 743 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); 744 return false; 745 } 746 747 const SCEV *Step = AR->getStepRecurrence(*SE); 748 const SCEV *Start = AR->getStart(); 749 750 if (!Step->isOne() || !Start->isZero()) { 751 DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); 752 return false; 753 } 754 } 755 756 // If this is a load, record its pointer. If it is not a load, abort. 757 // Notice that we don't handle function calls that read or write. 758 if (I->mayReadFromMemory()) { 759 LoadInst *Ld = dyn_cast<LoadInst>(I); 760 if (!Ld) return false; 761 if (!Ld->isSimple()) { 762 DEBUG(dbgs() << "LV: Found a non-simple load.\n"); 763 return false; 764 } 765 766 Value* Ptr = Ld->getPointerOperand(); 767 if (AnalyzedPtrs.insert(Ptr)) 768 GetUnderlyingObjects(Ptr, Reads, DL); 769 } 770 771 // Record store pointers. Abort on all other instructions that write to 772 // memory. 773 if (I->mayWriteToMemory()) { 774 StoreInst *St = dyn_cast<StoreInst>(I); 775 if (!St) return false; 776 if (!St->isSimple()) { 777 DEBUG(dbgs() << "LV: Found a non-simple store.\n"); 778 return false; 779 } 780 781 Value* Ptr = St->getPointerOperand(); 782 if (AnalyzedPtrs.insert(Ptr)) 783 GetUnderlyingObjects(St->getPointerOperand(), Writes, DL); 784 } 785 786 // We still don't handle functions. 787 CallInst *CI = dyn_cast<CallInst>(I); 788 if (CI) { 789 DEBUG(dbgs() << "LV: Found a call site:"<< 790 CI->getCalledFunction()->getName() << "\n"); 791 return false; 792 } 793 794 // We do not re-vectorize vectors. 795 if (!VectorType::isValidElementType(I->getType()) && 796 !I->getType()->isVoidTy()) { 797 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); 798 return false; 799 } 800 //Check that all of the users of the loop are inside the BB. 801 for (Value::use_iterator it = I->use_begin(), e = I->use_end(); 802 it != e; ++it) { 803 Instruction *U = cast<Instruction>(*it); 804 BasicBlock *Parent = U->getParent(); 805 if (Parent != &BB) { 806 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); 807 return false; 808 } 809 } 810 } // next instr. 811 812 if (NumPhis != 1) { 813 DEBUG(dbgs() << "LV: Did not find a Phi node.\n"); 814 return false; 815 } 816 817 // Check that the underlying objects of the reads and writes are either 818 // disjoint memory locations, or that they are no-alias arguments. 819 ValueVector::iterator r, re, w, we; 820 for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { 821 if (!isIdentifiedSafeObject(*r)) { 822 DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n"); 823 return false; 824 } 825 } 826 827 for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { 828 if (!isIdentifiedSafeObject(*w)) { 829 DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n"); 830 return false; 831 } 832 } 833 834 // Check that there are no multiple write locations to the same pointer. 835 SmallPtrSet<Value*, 8> WritePointerSet; 836 for (w = Writes.begin(), we = Writes.end(); w != we; ++w) { 837 if (!WritePointerSet.insert(*w)) { 838 DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n"); 839 return false; 840 } 841 } 842 843 // Check that the reads and the writes are disjoint. 844 for (r = Reads.begin(), re = Reads.end(); r != re; ++r) { 845 if (WritePointerSet.count(*r)) { 846 DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n"); 847 return false; 848 } 849 } 850 851 // All is okay. 852 return true; 853 } 854 855 /// Checks if the value is a Global variable or if it is an Arguments 856 /// marked with the NoAlias attribute. 857 bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { 858 assert(Val && "Invalid value"); 859 if (dyn_cast<GlobalValue>(Val)) 860 return true; 861 if (dyn_cast<AllocaInst>(Val)) 862 return true; 863 Argument *A = dyn_cast<Argument>(Val); 864 if (!A) 865 return false; 866 return A->hasNoAliasAttr(); 867 } 868 869 } // namespace 870 871 char LoopVectorize::ID = 0; 872 static const char lv_name[] = "Loop Vectorization"; 873 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 874 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 875 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 876 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 877 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 878 879 namespace llvm { 880 Pass *createLoopVectorizePass() { 881 return new LoopVectorize(); 882 } 883 884 } 885 886